	cli: add EOT when user hit Ctrl+C (#8296)
* main: add need_insert_eot
* do not format system prompt if it is empty
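For orientation, here is a minimal standalone sketch of the first bullet, condensed from the hunks below. Only the flag names echo the commit (need_insert_eot); everything else, including the token id, is illustrative, since the real code asks the model for its EOT token.

#include <csignal>
#include <cstdio>
#include <vector>

// illustrative stand-ins for the state in examples/main/main.cpp
static volatile std::sig_atomic_t g_interrupted     = 0;
static volatile std::sig_atomic_t g_need_insert_eot = 0;

static void on_sigint(int /*signo*/) {
    // do as little as possible in the handler: just raise flags,
    // the main loop decides what to do with them
    g_interrupted     = 1;
    g_need_insert_eot = 1;
}

int main() {
    std::signal(SIGINT, on_sigint);

    std::vector<int> pending_tokens;   // stands in for embd_inp
    const int fake_eot_token = 32007;  // hypothetical EOT id; the real code queries the model

    std::puts("press Ctrl+C to interrupt the (pretend) generation...");
    while (!g_interrupted) { /* pretend tokens are being streamed here */ }

    if (g_need_insert_eot) {
        // the interrupted assistant turn is closed before the next user message is appended
        pending_tokens.push_back(fake_eot_token);
        g_need_insert_eot = 0;
    }
    std::printf("appended EOT, pending tokens: %zu\n", pending_tokens.size());
    return 0;
}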
@@ -1394,7 +1394,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*",           "       --keep N",               "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
     options.push_back({ "*",           "       --chunks N",             "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
     options.push_back({ "*",           "-fa,   --flash-attn",           "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
-    options.push_back({ "*",           "-p,    --prompt PROMPT",        "prompt to start generation with (default: '%s')", params.prompt.c_str() });
+    options.push_back({ "*",           "-p,    --prompt PROMPT",        "prompt to start generation with\n"
+                                                                        "in conversation mode, this will be used as system prompt\n"
+                                                                        "(default: '%s')", params.prompt.c_str() });
     options.push_back({ "*",           "-f,    --file FNAME",           "a file containing the prompt (default: none)" });
     options.push_back({ "*",           "       --in-file FNAME",        "an input file (repeat to specify multiple files)" });
     options.push_back({ "*",           "-bf,   --binary-file FNAME",    "binary file containing the prompt (default: none)" });
@@ -1409,7 +1411,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                                                                         "halt generation at PROMPT, return control in interactive mode\n"
                                                                         "can be specified more than once for multiple prompts" });
     options.push_back({ "main",        "-sp,   --special",              "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
-    options.push_back({ "main",        "-cnv,  --conversation",         "run in conversation mode (does not print special tokens and suffix/prefix, use default chat template) (default: %s)", params.conversation ? "true" : "false" });
+    options.push_back({ "main",        "-cnv,  --conversation",         "run in conversation mode, does not print special tokens and suffix/prefix\n"
+                                                                        "if suffix/prefix are not specified, default chat template will be used\n"
+                                                                        "(default: %s)", params.conversation ? "true" : "false" });
     options.push_back({ "main infill", "-i,    --interactive",          "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
     options.push_back({ "main infill", "-if,   --interactive-first",    "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
     options.push_back({ "main infill", "-mli,  --multiline-input",      "allows you to write or paste multiple lines without ending each in '\\'" });
@@ -1453,6 +1457,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "main",        "       --cfg-scale N",          "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
     options.push_back({ "main",        "       --chat-template JINJA_TEMPLATE",
                                                                         "set custom jinja chat template (default: template taken from model's metadata)\n"
+                                                                        "if suffix/prefix are specified, template will be disabled\n"
                                                                         "only commonly used templates are accepted:\n"
                                                                         "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
     options.push_back({ "grammar" });

@@ -37,7 +37,8 @@ static gpt_params               * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream       * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
-static bool is_interacting = false;
+static bool is_interacting  = false;
+static bool need_insert_eot = false;

 static bool file_exists(const std::string & path) {
     std::ifstream f(path.c_str());
@@ -99,7 +100,8 @@ static void write_logfile(
 static void sigint_handler(int signo) {
     if (signo == SIGINT) {
         if (!is_interacting && g_params->interactive) {
-            is_interacting = true;
+            is_interacting  = true;
+            need_insert_eot = true;
         } else {
             console::cleanup();
             printf("\n");
@@ -224,7 +226,14 @@ int main(int argc, char ** argv) {
                 __func__, n_ctx_train, n_ctx);
     }

-    LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
+    // print chat template example in conversation mode
+    if (params.conversation) {
+        if (params.enable_chat_template) {
+            LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
+        } else {
+            LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
+        }
+    }

     // print system information
     {
@@ -263,7 +272,7 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> embd_inp;

     {
-        auto prompt = (params.conversation && params.enable_chat_template)
+        auto prompt = (params.conversation && params.enable_chat_template && !params.prompt.empty())
             ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
             : params.prompt;
         if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
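The hunk above is the "do not format system prompt if it is empty" half of the commit message. A minimal sketch of the same guard, with a hypothetical format_system() standing in for chat_add_and_format() and a made-up template string:

#include <cstdio>
#include <string>

// hypothetical stand-in for chat_add_and_format(model, chat_msgs, "system", ...);
// the real function runs the model's chat template, this just fakes one
static std::string format_system(const std::string & prompt) {
    return "<|system|>\n" + prompt + "\n";
}

// same guard as the hunk above: only template the system prompt when conversation
// mode is on, templating is enabled, and the prompt is actually non-empty
static std::string build_initial_prompt(bool conversation, bool enable_chat_template,
                                        const std::string & prompt) {
    return (conversation && enable_chat_template && !prompt.empty())
        ? format_system(prompt)
        : prompt;
}

int main() {
    // before the fix, the empty case would still have been pushed through the template
    std::printf("[%s]\n", build_initial_prompt(true, true, "You are helpful.").c_str());
    std::printf("[%s]\n", build_initial_prompt(true, true, "").c_str());
    return 0;
}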
@@ -905,6 +914,13 @@ int main(int argc, char ** argv) {

                     LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

+                    // if user stop generation mid-way, we must add EOT to finish model's last response
+                    if (need_insert_eot && format_chat) {
+                        llama_token eot = llama_token_eot(model);
+                        embd_inp.push_back(eot == -1 ? llama_token_eos(model) : eot);
+                        need_insert_eot = false;
+                    }
+
                     embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
                     embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
                     embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
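The fallback in this last hunk is the interesting part: not every model defines a dedicated EOT token. A small helper sketch of the same logic, assuming the llama.h API of this era (the helper name is not from the commit):

#include "llama.h"

// prefer the model's end-of-turn token; fall back to EOS for models that
// do not define one (llama_token_eot() returns -1 in that case)
static llama_token turn_end_token(const struct llama_model * model) {
    const llama_token eot = llama_token_eot(model);
    return eot == -1 ? llama_token_eos(model) : eot;
}

With that token appended to embd_inp before the next user line, the chat template sees a properly closed assistant turn instead of one cut off mid-sentence by Ctrl+C.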
Xuan Son Nguyen