mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	server : add "samplers" param to control the samplers order (#5494)
This commit is contained in:
		| @@ -341,7 +341,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { | |||||||
|                 break; |                 break; | ||||||
|             } |             } | ||||||
|             const auto sampler_names = string_split(argv[i], ';'); |             const auto sampler_names = string_split(argv[i], ';'); | ||||||
|             sparams.samplers_sequence = sampler_types_from_names(sampler_names); |             sparams.samplers_sequence = sampler_types_from_names(sampler_names, true); | ||||||
|         } else if (arg == "--sampling-seq") { |         } else if (arg == "--sampling-seq") { | ||||||
|             if (++i >= argc) { |             if (++i >= argc) { | ||||||
|                 invalid_param = true; |                 invalid_param = true; | ||||||
| @@ -964,7 +964,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { | |||||||
|     printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); |     printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); | ||||||
|     printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); |     printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); | ||||||
|     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch); |     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch); | ||||||
|     printf("  --samplers            samplers that will be used for generation in the order, separated by \';\' (default: %s)\n", sampler_type_names.c_str()); |     printf("  --samplers            samplers that will be used for generation in the order, separated by \';\'\n"); | ||||||
|  |     printf("                        (default: %s)\n", sampler_type_names.c_str()); | ||||||
|     printf("  --sampling-seq        simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str()); |     printf("  --sampling-seq        simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str()); | ||||||
|     printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k); |     printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k); | ||||||
|     printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p); |     printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p); | ||||||
| @@ -1133,34 +1134,50 @@ std::vector<std::string> string_split(std::string input, char separator) { | |||||||
|     return parts; |     return parts; | ||||||
| } | } | ||||||
|  |  | ||||||
| std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names) { | std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) { | ||||||
|  |     std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map { | ||||||
|  |         {"top_k",       llama_sampler_type::TOP_K}, | ||||||
|  |         {"top_p",       llama_sampler_type::TOP_P}, | ||||||
|  |         {"typical_p",   llama_sampler_type::TYPICAL_P}, | ||||||
|  |         {"min_p",       llama_sampler_type::MIN_P}, | ||||||
|  |         {"tfs_z",       llama_sampler_type::TFS_Z}, | ||||||
|  |         {"temperature", llama_sampler_type::TEMPERATURE} | ||||||
|  |     }; | ||||||
|  |  | ||||||
|     // since samplers names are written multiple ways |     // since samplers names are written multiple ways | ||||||
|     // make it ready for both system names and input names |     // make it ready for both system names and input names | ||||||
|     std::unordered_map<std::string, llama_sampler_type> sampler_name_map { |     std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map { | ||||||
|         {"top_k",       llama_sampler_type::TOP_K}, |  | ||||||
|         {"top-k",       llama_sampler_type::TOP_K}, |         {"top-k",       llama_sampler_type::TOP_K}, | ||||||
|         {"top_p",       llama_sampler_type::TOP_P}, |  | ||||||
|         {"top-p",       llama_sampler_type::TOP_P}, |         {"top-p",       llama_sampler_type::TOP_P}, | ||||||
|         {"nucleus",     llama_sampler_type::TOP_P}, |         {"nucleus",     llama_sampler_type::TOP_P}, | ||||||
|         {"typical_p",   llama_sampler_type::TYPICAL_P}, |  | ||||||
|         {"typical-p",   llama_sampler_type::TYPICAL_P}, |         {"typical-p",   llama_sampler_type::TYPICAL_P}, | ||||||
|         {"typical",     llama_sampler_type::TYPICAL_P}, |         {"typical",     llama_sampler_type::TYPICAL_P}, | ||||||
|         {"min_p",       llama_sampler_type::MIN_P}, |  | ||||||
|         {"min-p",       llama_sampler_type::MIN_P}, |         {"min-p",       llama_sampler_type::MIN_P}, | ||||||
|         {"tfs_z",       llama_sampler_type::TFS_Z}, |  | ||||||
|         {"tfs-z",       llama_sampler_type::TFS_Z}, |         {"tfs-z",       llama_sampler_type::TFS_Z}, | ||||||
|         {"tfs",         llama_sampler_type::TFS_Z}, |         {"tfs",         llama_sampler_type::TFS_Z}, | ||||||
|         {"temp",        llama_sampler_type::TEMP}, |         {"temp",        llama_sampler_type::TEMPERATURE} | ||||||
|         {"temperature", llama_sampler_type::TEMP} |  | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     std::vector<llama_sampler_type> sampler_types; |     std::vector<llama_sampler_type> sampler_types; | ||||||
|     sampler_types.reserve(names.size()); |     sampler_types.reserve(names.size()); | ||||||
|     for (const auto& name : names) { |     for (const auto & name : names) | ||||||
|         const auto sampler_item = sampler_name_map.find(name); |     { | ||||||
|         if (sampler_item != sampler_name_map.end()) { |         auto sampler_item = sampler_canonical_name_map.find(name); | ||||||
|  |         if (sampler_item != sampler_canonical_name_map.end()) | ||||||
|  |         { | ||||||
|             sampler_types.push_back(sampler_item->second); |             sampler_types.push_back(sampler_item->second); | ||||||
|         } |         } | ||||||
|  |         else | ||||||
|  |         { | ||||||
|  |             if (allow_alt_names) | ||||||
|  |             { | ||||||
|  |                 sampler_item = sampler_alt_name_map.find(name); | ||||||
|  |                 if (sampler_item != sampler_alt_name_map.end()) | ||||||
|  |                 { | ||||||
|  |                     sampler_types.push_back(sampler_item->second); | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |         } | ||||||
|     } |     } | ||||||
|     return sampler_types; |     return sampler_types; | ||||||
| } | } | ||||||
| @@ -1172,7 +1189,7 @@ std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & nam | |||||||
|         {'y', llama_sampler_type::TYPICAL_P}, |         {'y', llama_sampler_type::TYPICAL_P}, | ||||||
|         {'m', llama_sampler_type::MIN_P}, |         {'m', llama_sampler_type::MIN_P}, | ||||||
|         {'f', llama_sampler_type::TFS_Z}, |         {'f', llama_sampler_type::TFS_Z}, | ||||||
|         {'t', llama_sampler_type::TEMP} |         {'t', llama_sampler_type::TEMPERATURE} | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     std::vector<llama_sampler_type> sampler_types; |     std::vector<llama_sampler_type> sampler_types; | ||||||
| @@ -1188,12 +1205,12 @@ std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & nam | |||||||
|  |  | ||||||
| std::string sampler_type_to_name_string(llama_sampler_type sampler_type) { | std::string sampler_type_to_name_string(llama_sampler_type sampler_type) { | ||||||
|     switch (sampler_type) { |     switch (sampler_type) { | ||||||
|         case llama_sampler_type::TOP_K:     return "top_k"; |         case llama_sampler_type::TOP_K:       return "top_k"; | ||||||
|         case llama_sampler_type::TFS_Z:     return "tfs_z"; |         case llama_sampler_type::TFS_Z:       return "tfs_z"; | ||||||
|         case llama_sampler_type::TYPICAL_P: return "typical_p"; |         case llama_sampler_type::TYPICAL_P:   return "typical_p"; | ||||||
|         case llama_sampler_type::TOP_P:     return "top_p"; |         case llama_sampler_type::TOP_P:       return "top_p"; | ||||||
|         case llama_sampler_type::MIN_P:     return "min_p"; |         case llama_sampler_type::MIN_P:       return "min_p"; | ||||||
|         case llama_sampler_type::TEMP:      return "temp"; |         case llama_sampler_type::TEMPERATURE: return "temperature"; | ||||||
|         default : return ""; |         default : return ""; | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -165,7 +165,7 @@ void process_escapes(std::string& input); | |||||||
| // String utils | // String utils | ||||||
| // | // | ||||||
|  |  | ||||||
| std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names); | std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names); | ||||||
| std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string); | std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string); | ||||||
| std::vector<std::string> string_split(std::string input, char separator); | std::vector<std::string> string_split(std::string input, char separator); | ||||||
| std::string sampler_type_to_name_string(llama_sampler_type sampler_type); | std::string sampler_type_to_name_string(llama_sampler_type sampler_type); | ||||||
|   | |||||||
| @@ -139,7 +139,7 @@ static void sampler_queue( | |||||||
|             case llama_sampler_type::TYPICAL_P: llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break; |             case llama_sampler_type::TYPICAL_P: llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break; | ||||||
|             case llama_sampler_type::TOP_P    : llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break; |             case llama_sampler_type::TOP_P    : llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break; | ||||||
|             case llama_sampler_type::MIN_P    : llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break; |             case llama_sampler_type::MIN_P    : llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break; | ||||||
|             case llama_sampler_type::TEMP: |             case llama_sampler_type::TEMPERATURE: | ||||||
|                 if (dynatemp_range > 0) { |                 if (dynatemp_range > 0) { | ||||||
|                     float dynatemp_min = std::max(0.0f, temp - dynatemp_range); |                     float dynatemp_min = std::max(0.0f, temp - dynatemp_range); | ||||||
|                     float dynatemp_max = std::max(0.0f, temp + dynatemp_range); |                     float dynatemp_max = std::max(0.0f, temp + dynatemp_range); | ||||||
|   | |||||||
| @@ -10,12 +10,12 @@ | |||||||
|  |  | ||||||
| // sampler types | // sampler types | ||||||
| enum class llama_sampler_type : char { | enum class llama_sampler_type : char { | ||||||
|     TOP_K     = 'k', |     TOP_K       = 'k', | ||||||
|     TOP_P     = 'p', |     TOP_P       = 'p', | ||||||
|     MIN_P     = 'm', |     MIN_P       = 'm', | ||||||
|     TFS_Z     = 'f', |     TFS_Z       = 'f', | ||||||
|     TYPICAL_P = 'y', |     TYPICAL_P   = 'y', | ||||||
|     TEMP      = 't' |     TEMPERATURE = 't' | ||||||
| }; | }; | ||||||
|  |  | ||||||
| // sampling parameters | // sampling parameters | ||||||
| @@ -45,7 +45,7 @@ typedef struct llama_sampling_params { | |||||||
|         llama_sampler_type::TYPICAL_P, |         llama_sampler_type::TYPICAL_P, | ||||||
|         llama_sampler_type::TOP_P, |         llama_sampler_type::TOP_P, | ||||||
|         llama_sampler_type::MIN_P, |         llama_sampler_type::MIN_P, | ||||||
|         llama_sampler_type::TEMP |         llama_sampler_type::TEMPERATURE | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     std::string grammar;  // optional BNF-like grammar to constrain sampling |     std::string grammar;  // optional BNF-like grammar to constrain sampling | ||||||
|   | |||||||
| @@ -204,6 +204,8 @@ node index.js | |||||||
|  |  | ||||||
|     `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime) |     `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime) | ||||||
|  |  | ||||||
|  |     `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. (default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values) | ||||||
|  |  | ||||||
| ### Result JSON | ### Result JSON | ||||||
|  |  | ||||||
| - Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion. | - Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion. | ||||||
|   | |||||||
| @@ -672,6 +672,24 @@ struct llama_server_context | |||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|  |         const auto &samplers_sequence = data.find("samplers"); | ||||||
|  |         if (samplers_sequence != data.end() && samplers_sequence->is_array()) | ||||||
|  |         { | ||||||
|  |             std::vector<std::string> sampler_names; | ||||||
|  |             for (const auto &sampler_name : *samplers_sequence) | ||||||
|  |             { | ||||||
|  |                 if (sampler_name.is_string()) | ||||||
|  |                 { | ||||||
|  |                     sampler_names.emplace_back(sampler_name); | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |             slot->sparams.samplers_sequence = sampler_types_from_names(sampler_names, false); | ||||||
|  |         } | ||||||
|  |         else | ||||||
|  |         { | ||||||
|  |             slot->sparams.samplers_sequence = default_sparams.samplers_sequence; | ||||||
|  |         } | ||||||
|  |  | ||||||
|         if (multimodal) |         if (multimodal) | ||||||
|         { |         { | ||||||
|             const auto &images_data = data.find("image_data"); |             const auto &images_data = data.find("image_data"); | ||||||
| @@ -1026,6 +1044,12 @@ struct llama_server_context | |||||||
|         const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); |         const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); | ||||||
|         const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && |         const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && | ||||||
|                                 eos_bias->second < 0.0f && std::isinf(eos_bias->second); |                                 eos_bias->second < 0.0f && std::isinf(eos_bias->second); | ||||||
|  |         std::vector<std::string> samplers_sequence; | ||||||
|  |         for (const auto &sampler_type : slot.sparams.samplers_sequence) | ||||||
|  |         { | ||||||
|  |             samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type)); | ||||||
|  |         } | ||||||
|  |  | ||||||
|         return json { |         return json { | ||||||
|             {"n_ctx",             slot.n_ctx}, |             {"n_ctx",             slot.n_ctx}, | ||||||
|             {"model",             params.model_alias}, |             {"model",             params.model_alias}, | ||||||
| @@ -1056,6 +1080,7 @@ struct llama_server_context | |||||||
|             {"logit_bias",        slot.sparams.logit_bias}, |             {"logit_bias",        slot.sparams.logit_bias}, | ||||||
|             {"n_probs",           slot.sparams.n_probs}, |             {"n_probs",           slot.sparams.n_probs}, | ||||||
|             {"grammar",           slot.sparams.grammar}, |             {"grammar",           slot.sparams.grammar}, | ||||||
|  |             {"samplers",          samplers_sequence} | ||||||
|         }; |         }; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Alexey Parfenov
					Alexey Parfenov