	server : --n-predict option document and cap to max value (#5549)
* server: document --n-predict
* server: ensure client request cannot override n_predict if set
* server: fix print usage LF in new --n-predict option
@@ -39,6 +39,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
 - `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
 - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
+- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)

 ## Build

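The new flag gives the server a hard ceiling on how many tokens any single request may generate. Below is a minimal standalone sketch of that capping rule as the server-side hunks further down implement it; the helper name and the main driver are made up for illustration, only the clamp condition mirrors the diff.

    // Standalone sketch (not llama.cpp code): a server limit of -1 (the
    // default) means "no cap", any positive limit clamps the per-request
    // n_predict. cap_n_predict is a hypothetical helper.
    #include <cstdint>
    #include <cstdio>

    static int32_t cap_n_predict(int32_t requested, int32_t server_limit) {
        if (server_limit > 0 && requested > server_limit) {
            return server_limit; // clamp rather than reject the request
        }
        return requested;
    }

    int main() {
        printf("%d\n", (int) cap_n_predict(4096, 512)); // 512: clamped to the server limit
        printf("%d\n", (int) cap_n_predict(128,  512)); // 128: already within the limit
        printf("%d\n", (int) cap_n_predict(4096, -1));  // 4096: limit disabled (default)
        return 0;
    }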
@@ -159,6 +159,7 @@ struct llama_client_slot
     int32_t n_decoded   = 0;
     int32_t n_remaining = -1;
     int32_t i_batch     = -1;
+    int32_t n_predict   = -1;

     int32_t num_prompt_tokens           = 0;
     int32_t num_prompt_tokens_processed = 0;
@@ -410,6 +411,7 @@ struct llama_server_context

             slot.id = i;
             slot.n_ctx = n_ctx_slot;
+            slot.n_predict = params.n_predict;

             LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);

@@ -546,6 +548,15 @@ struct llama_server_context
         slot->sparams.grammar           = json_value(data, "grammar",           default_sparams.grammar);
         slot->sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);

+        if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
+            // Might be better to reject the request with a 400 ?
+            LOG_WARNING("Max tokens to predict exceeds server configuration", {
+                {"params.n_predict", slot->params.n_predict},
+                {"slot.n_predict", slot->n_predict},
+            });
+            slot->params.n_predict = slot->n_predict;
+        }
+
         // infill
         if (data.count("input_prefix") != 0)
         {
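The in-diff comment leaves open whether an oversized request should instead be rejected outright. A hedged sketch of that alternative, with an invented request_error type standing in for the server's real error plumbing:

    // Hypothetical sketch (not llama.cpp code) of the alternative the comment
    // raises: reject an oversized n_predict with HTTP 400 instead of clamping,
    // so the client learns the limit rather than getting a silently shortened
    // completion. request_error and the message text are made up.
    #include <cstdint>
    #include <cstdio>
    #include <optional>
    #include <string>

    struct request_error {
        int         status;
        std::string message;
    };

    static std::optional<request_error> validate_n_predict(int32_t requested, int32_t server_limit) {
        if (server_limit > 0 && requested > server_limit) {
            return request_error{
                400,
                "n_predict " + std::to_string(requested) +
                " exceeds the server maximum of " + std::to_string(server_limit)
            };
        }
        return std::nullopt; // request is acceptable as-is
    }

    int main() {
        if (auto err = validate_n_predict(4096, 512)) {
            printf("HTTP %d: %s\n", err->status, err->message.c_str());
        }
        return 0;
    }

Clamping, as the commit does, keeps existing clients working unchanged; the log warning is then the only signal that the completion was shortened.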
@@ -1053,6 +1064,7 @@ struct llama_server_context

         return json {
             {"n_ctx",             slot.n_ctx},
+            {"n_predict",         slot.n_predict},
             {"model",             params.model_alias},
             {"seed",              slot.params.seed},
             {"temperature",       slot.sparams.temp},
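Because the slot's settings JSON now carries n_predict, a client can read back the effective cap instead of guessing. A minimal sketch, assuming the object is parsed with nlohmann::json (the library behind the server's `json` type); the payload shape beyond n_ctx and n_predict is illustrative only.

    // Sketch of inspecting the reported limit on the client side. Only the
    // n_predict field mirrors the diff; the rest is assumed for illustration.
    #include <cstdio>
    #include <nlohmann/json.hpp>

    int main() {
        // Example payload shaped like the json object built above.
        nlohmann::json settings = {
            {"n_ctx",     2048},
            {"n_predict", 512}  // the server-side cap applied to this slot
        };

        // -1 would mean the server imposes no limit of its own.
        const int n_predict = settings.value("n_predict", -1);
        printf("effective n_predict: %d\n", n_predict);
        return 0;
    }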
@@ -1915,13 +1927,14 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  --mmproj MMPROJ_FILE      path to a multimodal projector file for LLaVA.\n");
     printf("  --log-disable             disables logging to a file.\n");
     printf("\n");
+    printf("  -n, --n-predict           maximum tokens to predict (default: %d)\n", params.n_predict);
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("                            advanced option to override model metadata by key. may be specified multiple times.\n");
     printf("                            types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
     printf("  -gan N, --grp-attn-n N    set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
     printf("  -gaw N, --grp-attn-w N    set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
     printf("  --chat-template FORMAT_NAME");
-    printf("                            set chat template, possible valus is: llama2, chatml (default %s)", sparams.chat_template.c_str());
+    printf("                            set chat template, possible value is: llama2, chatml (default %s)", sparams.chat_template.c_str());
     printf("\n");
 }

Author: Pierrick Hymbert