Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)
	server : fix system prompt cli (#5516)
@@ -436,10 +436,6 @@ struct llama_server_context
         default_generation_settings_for_props["seed"] = -1;
 
         batch = llama_batch_init(n_ctx, 0, params.n_parallel);
-
-        // empty system prompt
-        system_prompt = "";
-        system_tokens.clear();
     }
 
     std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
@@ -765,27 +761,30 @@ struct llama_server_context
     }
 
     void update_system_prompt() {
-        system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
-
-        llama_batch_clear(batch);
-
         kv_cache_clear();
+        system_tokens.clear();
 
-        for (int i = 0; i < (int) system_tokens.size(); ++i)
-        {
-            llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
-        }
+        if (!system_prompt.empty()) {
+            system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
 
-        if (llama_decode(ctx, batch) != 0)
-        {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
-            return;
-        }
+            llama_batch_clear(batch);
 
-        // assign the system KV cache to all parallel sequences
-        for (int32_t i = 1; i < params.n_parallel; ++i)
-        {
-            llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
+            for (int i = 0; i < (int)system_tokens.size(); ++i)
+            {
+                llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
+            }
+
+            if (llama_decode(ctx, batch) != 0)
+            {
+                LOG_TEE("%s: llama_decode() failed\n", __func__);
+                return;
+            }
+
+            // assign the system KV cache to all parallel sequences
+            for (int32_t i = 1; i < params.n_parallel; ++i)
+            {
+                llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
+            }
         }
 
         LOG_TEE("system prompt updated\n");
@@ -807,10 +806,8 @@ struct llama_server_context
         name_user      = sys_props.value("anti_prompt", "");
         name_assistant = sys_props.value("assistant_name", "");
 
-        if (slots.size() > 0)
-        {
-            notify_system_prompt_changed();
-        }
+        notify_system_prompt_changed();
     }
 
     static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
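Taken together, the patch stops the constructor from eagerly resetting the system prompt, and makes update_system_prompt() first clear the KV cache and the cached tokens, then tokenize and decode only when a non-empty system prompt is actually set, sharing the resulting KV cache with every parallel sequence. The sketch below restates that post-patch logic as a free function so it can be read outside the diff; the name prime_system_prompt and its parameter list are illustrative, the calls (llama_tokenize, llama_batch_clear, llama_batch_add, llama_decode, llama_kv_cache_seq_cp) are the same ones used in the diff, and llama_kv_cache_clear stands in for the server's kv_cache_clear() wrapper.

#include <string>
#include <vector>

#include "common.h"   // llama_tokenize, llama_batch_clear, llama_batch_add helpers
#include "llama.h"

// Illustrative sketch of the post-patch logic; not part of the server code itself.
static bool prime_system_prompt(llama_context * ctx, llama_batch & batch,
                                const std::string & system_prompt,
                                std::vector<llama_token> & system_tokens,
                                bool add_bos_token, int32_t n_parallel) {
    // always start from an empty KV cache and an empty token list
    llama_kv_cache_clear(ctx);
    system_tokens.clear();

    if (system_prompt.empty()) {
        // nothing to evaluate; slots start from an empty cache
        return true;
    }

    // tokenize the prompt and queue every token into sequence 0
    system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);

    llama_batch_clear(batch);
    for (int i = 0; i < (int) system_tokens.size(); ++i) {
        llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
    }

    if (llama_decode(ctx, batch) != 0) {
        return false;
    }

    // assign the system KV cache to all parallel sequences
    for (int32_t i = 1; i < n_parallel; ++i) {
        llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
    }
    return true;
}

Because every parallel sequence receives a copy of sequence 0's cache entries for positions [0, system_tokens.size()), each slot can append its own tokens after the shared prefix without re-evaluating the system prompt.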
Author: Rőczey Barnabás