Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
YaRN : store rope scaling type as int32_t in memory (#5285)

Author: Jared Van Bortel

* YaRN : store rope scaling type as int32_t in memory
* llama : store mapped names as const char *
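The old field already carried a TODO saying int32_t would be better for alignment; this commit applies that across gpt_params, llama_hparams, and llama_context_params, and switches the name maps from std::string to const char * along the way. A minimal standalone sketch of the alignment point, using hypothetical struct names rather than the project's:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Hypothetical stand-ins for the real structs, reduced to two fields.
    struct params_i8  { int8_t  rope_scaling_type; float rope_freq_base; };
    struct params_i32 { int32_t rope_scaling_type; float rope_freq_base; };

    int main() {
        // Both structs place the float at offset 4, but the int8_t version
        // gets there by inserting 3 bytes of padding after the one-byte
        // field; the int32_t version fills that space with the value itself.
        printf("i8 : float at %zu, sizeof %zu\n", offsetof(params_i8,  rope_freq_base), sizeof(params_i8));
        printf("i32: float at %zu, sizeof %zu\n", offsetof(params_i32, rope_freq_base), sizeof(params_i32));
        return 0;
    }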
common/common.h
@@ -75,8 +75,7 @@ struct gpt_params {
     float   yarn_beta_fast        = 32.0f; // YaRN low correction dim
     float   yarn_beta_slow        = 1.0f;  // YaRN high correction dim
     int32_t yarn_orig_ctx         = 0;     // YaRN original context length
-    int8_t  rope_scaling_type     = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
-                                                                    //       pinging @cebtenzzre
+    int32_t rope_scaling_type     = LLAMA_ROPE_SCALING_UNSPECIFIED;
 
     // // sampling parameters
     struct llama_sampling_params sparams;
							
								
								
									
llama.cpp
@@ -208,7 +208,7 @@ enum llm_arch {
     LLM_ARCH_UNKNOWN,
 };
 
-static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
+static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_LLAMA,           "llama"     },
     { LLM_ARCH_FALCON,          "falcon"    },
     { LLM_ARCH_GPT2,            "gpt2"      },
@@ -285,7 +285,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_RWKV,
 };
 
-static std::map<llm_kv, std::string> LLM_KV_NAMES = {
+static std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_ARCHITECTURE,          "general.architecture"                  },
     { LLM_KV_GENERAL_QUANTIZATION_VERSION,  "general.quantization_version"          },
     { LLM_KV_GENERAL_ALIGNMENT,             "general.alignment"                     },
@@ -346,7 +346,7 @@ struct LLM_KV {
     llm_arch arch;
 
     std::string operator()(llm_kv kv) const {
-        return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str());
+        return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]);
     }
 };
 
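Storing the names as const char * lets ::format (llama.cpp's printf-style formatting helper) consume them directly, dropping a std::string construction and the .c_str() calls on every lookup. A sketch of how the LLM_KV functor is used; the snippet is illustrative, but arch-scoped gguf keys really are stored as %s patterns expanded with the architecture name:

    // Illustrative use of the LLM_KV functor (not a line from the commit):
    // arch-scoped key patterns such as "%s.context_length" have their %s
    // replaced by the architecture name from LLM_ARCH_NAMES.
    LLM_KV kv(LLM_ARCH_LLAMA);
    std::string key = kv(LLM_KV_CONTEXT_LENGTH); // -> "llama.context_length"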
@@ -747,13 +747,13 @@ struct LLM_TN {
 // gguf helpers
 //
 
-static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
+static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
     { LLAMA_ROPE_SCALING_NONE,   "none"   },
     { LLAMA_ROPE_SCALING_LINEAR, "linear" },
     { LLAMA_ROPE_SCALING_YARN,   "yarn"   },
 };
 
-static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
+static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
     for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
         if (kv.second == name) {
             return kv.first;
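The hunk ends inside the function; in the file, the loop falls through to return LLAMA_ROPE_SCALING_UNSPECIFIED when no name matches. Expected behavior, as a hedged example:

    // Illustrative calls, not lines from the commit:
    int32_t t = llama_rope_scaling_type_from_string("yarn"); // LLAMA_ROPE_SCALING_YARN
    int32_t u = llama_rope_scaling_type_from_string("none"); // LLAMA_ROPE_SCALING_NONE
    int32_t v = llama_rope_scaling_type_from_string("oops"); // no match -> LLAMA_ROPE_SCALING_UNSPECIFIED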
@@ -1415,6 +1415,7 @@ static const size_t GiB = 1024*MiB;
 
 struct llama_hparams {
     bool     vocab_only;
+    bool     rope_finetuned;
     uint32_t n_vocab;
     uint32_t n_ctx_train; // context size the model was trained on
     uint32_t n_embd;
@@ -1434,8 +1435,7 @@ struct llama_hparams {
     float    rope_freq_base_train;
     float    rope_freq_scale_train;
     uint32_t n_yarn_orig_ctx;
-    int8_t   rope_scaling_type_train : 3;
-    bool     rope_finetuned : 1;
+    int32_t  rope_scaling_type_train;
 
     float f_clamp_kqv;
     float f_max_alibi_bias;
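Beyond the width change, this hunk removes the bitfields (the int8_t ... : 3 and bool ... : 1 members). One practical motivation for plain fields, shown in a standalone sketch with hypothetical helper and struct names: a bitfield has no address, so it cannot bind to the reference or pointer parameters that generic key-reading helpers take.

    #include <cstdint>

    struct hparams_old { int8_t  rope_scaling_type_train : 3; }; // bitfield, as before
    struct hparams_new { int32_t rope_scaling_type_train;     }; // plain field, as after

    // A generic "read this key into that variable" helper, the shape a model
    // loader's key-reading function takes (hypothetical signature):
    static void read_key_into(int32_t & out) { out = 2; /* e.g. yarn */ }

    int main() {
        hparams_old a = {};
        hparams_new b = {};
        // read_key_into(a.rope_scaling_type_train); // error: cannot bind a non-const reference to a bit-field
        read_key_into(b.rope_scaling_type_train);    // fine with a plain int32_t member
        (void)a;
        return 0;
    }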
@@ -2701,7 +2701,7 @@ struct llama_model_loader {
 // load LLaMA models
 //
 
-static std::string llama_model_arch_name(llm_arch arch) {
+static const char * llama_model_arch_name(llm_arch arch) {
     auto it = LLM_ARCH_NAMES.find(arch);
     if (it == LLM_ARCH_NAMES.end()) {
         return "unknown";
@@ -3310,11 +3310,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     const auto & hparams = model.hparams;
     const auto & vocab   = model.vocab;
 
-    const auto rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
+    const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
 
     // hparams
     LLAMA_LOG_INFO("%s: format           = %s\n",     __func__, llama_file_version_name(ml.fver));
-    LLAMA_LOG_INFO("%s: arch             = %s\n",     __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
+    LLAMA_LOG_INFO("%s: arch             = %s\n",     __func__, LLM_ARCH_NAMES.at(model.arch));
     LLAMA_LOG_INFO("%s: vocab type       = %s\n",     __func__, llama_model_vocab_type_name(vocab.type));
     LLAMA_LOG_INFO("%s: n_vocab          = %u\n",     __func__, hparams.n_vocab);
     LLAMA_LOG_INFO("%s: n_merges         = %u\n",     __func__, (int) vocab.bpe_ranks.size());
@@ -3336,7 +3336,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: n_ff             = %u\n",     __func__, hparams.n_ff);
     LLAMA_LOG_INFO("%s: n_expert         = %u\n",     __func__, hparams.n_expert);
     LLAMA_LOG_INFO("%s: n_expert_used    = %u\n",     __func__, hparams.n_expert_used);
-    LLAMA_LOG_INFO("%s: rope scaling     = %s\n",     __func__, rope_scaling_type.c_str());
+    LLAMA_LOG_INFO("%s: rope scaling     = %s\n",     __func__, rope_scaling_type);
     LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n",   __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",     __func__, hparams.rope_freq_scale_train);
     LLAMA_LOG_INFO("%s: n_yarn_orig_ctx  = %u\n",     __func__, hparams.n_yarn_orig_ctx);
@@ -10735,7 +10735,7 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3
 
 int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
-            llama_model_arch_name(model->arch).c_str(),
+            llama_model_arch_name(model->arch),
             llama_model_type_name(model->type),
             llama_model_ftype_name(model->ftype).c_str());
 }
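With llama_model_arch_name now returning const char *, its .c_str() call goes away; llama_model_ftype_name still returns a std::string, so its call stays. llama_model_desc itself is public API; typical use, assuming an already-loaded model handle:

    // Typical call into the public API (model is a loaded llama_model *):
    char desc[128];
    llama_model_desc(model, desc, sizeof(desc));
    // desc now holds "<arch> <type> <ftype>", e.g. "llama 7B Q4_0" for a
    // quantized LLaMA model (example output, not taken from the commit).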
							
								
								
									
llama.h
@@ -213,7 +213,7 @@ extern "C" {
         uint32_t n_batch;           // prompt processing maximum batch size
         uint32_t n_threads;         // number of threads to use for generation
         uint32_t n_threads_batch;   // number of threads to use for batch processing
-        int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+        int32_t  rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float    rope_freq_base;   // RoPE base frequency, 0 = from model
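The public field keeps its meaning and only widens to int32_t, so the struct stays usable from C. A hedged sketch of selecting YaRN scaling through it, using the existing llama_context_default_params / llama_new_context_with_model entry points (the model handle and the yarn_orig_ctx field name are assumed from the surrounding API):

    // Sketch: enable YaRN scaling via the widened field (model assumed loaded).
    struct llama_context_params cparams = llama_context_default_params();
    cparams.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; // now an int32_t
    cparams.yarn_orig_ctx     = 4096;                    // original training context (assumed field)
    struct llama_context * ctx = llama_new_context_with_model(model, cparams);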