Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
gguf : add ftype meta info to the model (#2710)

* llama : add ftype meta info to the model

  ggml-ci

* convert.py : add ftype when converting (does not work)

* convert.py : fix Enum to IntEnum

  ggml-ci
convert.py | 29
gguf.py    |  4
llama.cpp  | 21
llama.h    |  2

diff --git a/convert.py b/convert.py
@@ -69,7 +69,10 @@ SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
     'I32': DT_I32,
 }
 
-class GGMLFileType(enum.Enum):
+# TODO: match this with `llama_ftype`
+# TODO: rename to LLAMAFileType
+# TODO: move to `gguf.py`
+class GGMLFileType(enum.IntEnum):
     AllF32    = 0
     MostlyF16 = 1  # except 1d tensors
 
@@ -101,6 +104,8 @@ class Params:
     n_head_kv:  int
     f_norm_eps: float
 
+    ftype: Optional[GGMLFileType] = None
+
     @staticmethod
     def find_n_mult(n_ff: int, n_embd: int) -> int:
         # hardcoded magic range
@@ -738,6 +743,9 @@ class OutputFile:
         self.gguf.add_head_count_kv       (params.n_head_kv)
         self.gguf.add_layer_norm_rms_eps  (params.f_norm_eps)
 
+        if params.ftype:
+            self.gguf.add_file_type(params.ftype)
+
     def add_meta_vocab(self, vocab: Vocab) -> None:
         tokens = []
         scores = []
@@ -1020,6 +1028,12 @@ def main(args_in: Optional[List[str]] = None) -> None:
                             " - LLaMA v2: --ctx 4096\n")
         params.n_ctx = args.ctx
 
+    if args.outtype:
+        params.ftype = {
+            "f32": GGMLFileType.AllF32,
+            "f16": GGMLFileType.MostlyF16,
+        }[args.outtype]
+
     print(f"params = {params}")
 
     vocab: Vocab
@@ -1040,11 +1054,14 @@ def main(args_in: Optional[List[str]] = None) -> None:
             vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
             vocab = load_vocab(vocab_dir, args.vocabtype)
 
-        model       = model_plus.model
-        model       = convert_model_names(model, params)
-        output_type = pick_output_type(model, args.outtype)
-        model       = convert_to_output_type(model, output_type)
-        outfile     = args.outfile or default_outfile(model_plus.paths, output_type)
+        model   = model_plus.model
+        model   = convert_model_names(model, params)
+        ftype   = pick_output_type(model, args.outtype)
+        model   = convert_to_output_type(model, ftype)
+        outfile = args.outfile or default_outfile(model_plus.paths, ftype)
+
+        params.ftype = ftype
+        print(f"Writing {outfile}, format {ftype}")
 
         OutputFile.write_all(outfile, params, model, vocab)
         print(f"Wrote {outfile}")
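Aside: a minimal, self-contained sketch of the --outtype plumbing added above. The enum values and the dict lookup are taken from the diff; ftype_from_outtype is a hypothetical helper name used only for illustration.

    import enum
    from typing import Optional

    class GGMLFileType(enum.IntEnum):  # IntEnum: members must behave as plain ints
        AllF32    = 0
        MostlyF16 = 1  # except 1d tensors

    def ftype_from_outtype(outtype: Optional[str]) -> Optional[GGMLFileType]:
        # mirrors the lookup added to main(); an unrecognized string raises KeyError
        if not outtype:
            return None
        return {
            "f32": GGMLFileType.AllF32,
            "f16": GGMLFileType.MostlyF16,
        }[outtype]

    print(ftype_from_outtype("f16"))  # GGMLFileType.MostlyF16, written as uint32 value 1

The Enum to IntEnum fix in the last commit-message bullet matters because the value ends up in GGUFWriter.add_uint32, which needs a real integer. Note also that the `if params.ftype:` guard added in OutputFile is falsy for AllF32 (value 0), so an explicit all-F32 ftype is silently skipped; `if params.ftype is not None:` would cover that case.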
							
								
								
									
diff --git a/gguf.py b/gguf.py
@@ -26,6 +26,7 @@ KEY_GENERAL_DESCRIPTION          = "general.description"
 KEY_GENERAL_LICENSE              = "general.license"
 KEY_GENERAL_SOURCE_URL           = "general.source.url"
 KEY_GENERAL_SOURCE_HF_REPO       = "general.source.hugginface.repository"
+KEY_GENERAL_FILE_TYPE            = "general.file_type"
 
 # LLM
 KEY_LLM_CONTEXT_LENGTH        = "{arch}.context_length"
@@ -595,6 +596,9 @@ class GGUFWriter:
     def add_source_hf_repo(self, repo: str):
         self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)
 
+    def add_file_type(self, ftype: int):
+        self.add_uint32(KEY_GENERAL_FILE_TYPE, ftype)
+
     def add_name(self, name: str):
         self.add_string(KEY_GENERAL_NAME, name)
 
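Aside: what the new writer method amounts to, sketched with a hypothetical stand-in class (only add_file_type and the key constant come from the diff; the rest exists to make the example run):

    KEY_GENERAL_FILE_TYPE = "general.file_type"

    class KVSketch:
        """Hypothetical stand-in for GGUFWriter; records key/value pairs in a dict."""
        def __init__(self) -> None:
            self.kv: dict = {}

        def add_uint32(self, key: str, val: int) -> None:
            self.kv[key] = val

        def add_file_type(self, ftype: int) -> None:
            # same body as the method added above
            self.add_uint32(KEY_GENERAL_FILE_TYPE, ftype)

    w = KVSketch()
    w.add_file_type(1)  # GGMLFileType.MostlyF16
    print(w.kv)         # {'general.file_type': 1}

The C++ loader reads the same key back with gguf_find_key / gguf_get_val_u32, as the llama.cpp hunk below shows.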
							
								
								
									
diff --git a/llama.cpp b/llama.cpp
@@ -995,6 +995,16 @@ struct llama_model_loader {
                     } break;
             }
 
+            // this is a way to mark that we have "guessed" the file type
+            ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
+
+            {
+                const int kid = gguf_find_key(ctx_gguf, "general.file_type");
+                if (kid >= 0) {
+                    ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid);
+                }
+            }
+
             for (int i = 0; i < n_kv; i++) {
                 const char * name         = gguf_get_key(ctx_gguf, i);
                 const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
@@ -1197,7 +1207,11 @@ struct llama_model_loader {
 // load LLaMA models
 //
 
-const char * llama_model_ftype_name(enum llama_ftype ftype) {
+std::string llama_model_ftype_name(enum llama_ftype ftype) {
+    if (ftype & LLAMA_FTYPE_GUESSED) {
+        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
+    }
+
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32:     return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16:  return "mostly F16";
@@ -1426,7 +1440,7 @@ static void llama_model_load_internal(
         LLAMA_LOG_INFO("%s: freq_base    = %.1f\n",   __func__, hparams.rope_freq_base);
         LLAMA_LOG_INFO("%s: freq_scale   = %g\n",     __func__, hparams.rope_freq_scale);
         LLAMA_LOG_INFO("%s: model type   = %s\n",     __func__, llama_model_type_name(model.type));
-        LLAMA_LOG_INFO("%s: model ftype  = %s\n",     __func__, llama_model_ftype_name(model.ftype));
+        LLAMA_LOG_INFO("%s: model ftype  = %s\n",     __func__, llama_model_ftype_name(model.ftype).c_str());
         LLAMA_LOG_INFO("%s: model size   = %.2f B\n", __func__, ml->n_elements*1e-9);
 
         // general kv
@@ -3450,6 +3464,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // copy the KV pairs from the input file
     gguf_set_kv     (ctx_out, model_loader->ctx_gguf);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
+    gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv    = 0;
@@ -4310,7 +4325,7 @@ int llama_model_n_embd(const struct llama_model * model) {
 }
 
 int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
-    return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype));
+    return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype).c_str());
 }
 
 int llama_model_quantize(
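Aside: the LLAMA_FTYPE_GUESSED bit is the subtle part of this change: the loader first assumes the ftype is guessed from the tensor data, then lets an explicit general.file_type key override it. A small Python model of that logic, using the enum values from llama.h (the name table is truncated to two entries; the real llama_model_ftype_name covers every ftype):

    LLAMA_FTYPE_ALL_F32    = 0
    LLAMA_FTYPE_MOSTLY_F16 = 1
    LLAMA_FTYPE_GUESSED    = 1024  # a flag bit chosen above every real ftype value

    NAMES = {
        LLAMA_FTYPE_ALL_F32:    "all F32",
        LLAMA_FTYPE_MOSTLY_F16: "mostly F16",
    }

    def ftype_name(ftype: int) -> str:
        # strip the flag, name the base type, then append " (guessed)"
        if ftype & LLAMA_FTYPE_GUESSED:
            return ftype_name(ftype & ~LLAMA_FTYPE_GUESSED) + " (guessed)"
        return NAMES.get(ftype, f"unknown ({ftype})")

    # loader behavior: assume guessed, then let general.file_type override
    ftype = LLAMA_FTYPE_MOSTLY_F16 | LLAMA_FTYPE_GUESSED  # key absent from the file
    print(ftype_name(ftype))                    # -> mostly F16 (guessed)
    print(ftype_name(LLAMA_FTYPE_MOSTLY_F16))   # -> mostly F16   (key present)

Because the override assigns the stored value directly, reading the key clears the guessed bit; older GGUF files simply lack the key and keep the "(guessed)" suffix in logs.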
							
								
								
									
diff --git a/llama.h b/llama.h
@@ -103,6 +103,8 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16,// except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17,// except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q6_K          = 18,// except 1d tensors
+
+        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
 
     typedef struct llama_token_data {
Author: Georgi Gerganov