	server : --n-predict option document and cap to max value (#5549)
* server: document --n-predict
* server: ensure client request cannot override n_predict if set
* server: fix print usage LF in new --n-predict option
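As a rough usage sketch of the capped behavior (the model path, prompt, and requested value below are illustrative assumptions, not part of the commit): when the server is started with `--n-predict 512`, a completion request asking for more tokens should be clamped to the server-side limit and a warning logged instead of the client value being honored.

```shell
# Start the server with a hard cap of 512 predicted tokens per request
# (model path and default port 8080 are assumptions for this example).
./server -m models/7B/ggml-model.gguf --n-predict 512

# A client asking for more than the cap is clamped to 512 rather than honored.
curl --request POST --url http://localhost:8080/completion \
    --header "Content-Type: application/json" \
    --data '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 4096}'
```

The effective value should then be visible as `n_predict` in the slot's reported generation settings.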
@@ -39,6 +39,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
 - `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
 - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
+- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
 
 ## Build
 

@@ -159,6 +159,7 @@ struct llama_client_slot
     int32_t n_decoded   = 0;
     int32_t n_remaining = -1;
     int32_t i_batch     = -1;
+    int32_t n_predict   = -1;
 
     int32_t num_prompt_tokens           = 0;
     int32_t num_prompt_tokens_processed = 0;
@@ -410,6 +411,7 @@ struct llama_server_context
 
             slot.id = i;
             slot.n_ctx = n_ctx_slot;
+            slot.n_predict = params.n_predict;
 
             LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
 
@@ -546,6 +548,15 @@ struct llama_server_context
         slot->sparams.grammar           = json_value(data, "grammar",           default_sparams.grammar);
         slot->sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);
 
+        if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
+            // Might be better to reject the request with a 400 ?
+            LOG_WARNING("Max tokens to predict exceeds server configuration", {
+                {"params.n_predict", slot->params.n_predict},
+                {"slot.n_predict", slot->n_predict},
+            });
+            slot->params.n_predict = slot->n_predict;
+        }
+
         // infill
         if (data.count("input_prefix") != 0)
         {
@@ -1053,6 +1064,7 @@ struct llama_server_context
 
         return json {
             {"n_ctx",             slot.n_ctx},
+            {"n_predict",         slot.n_predict},
             {"model",             params.model_alias},
             {"seed",              slot.params.seed},
             {"temperature",       slot.sparams.temp},
@@ -1915,13 +1927,14 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  --mmproj MMPROJ_FILE      path to a multimodal projector file for LLaVA.\n");
     printf("  --log-disable             disables logging to a file.\n");
     printf("\n");
+    printf("  -n, --n-predict           maximum tokens to predict (default: %d)\n", params.n_predict);
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("                            advanced option to override model metadata by key. may be specified multiple times.\n");
     printf("                            types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
     printf("  -gan N, --grp-attn-n N    set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
     printf("  -gaw N, --grp-attn-w N    set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
     printf("  --chat-template FORMAT_NAME");
-    printf("                            set chat template, possible valus is: llama2, chatml (default %s)", sparams.chat_template.c_str());
+    printf("                            set chat template, possible value is: llama2, chatml (default %s)", sparams.chat_template.c_str());
     printf("\n");
 }
Pierrick Hymbert