mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
	Merge branch 'master' into xsn/private_batch_api
@@ -42,7 +42,7 @@ enum stop_type {
     STOP_TYPE_LIMIT,
 };
 
-// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
+// state diagram: https://github.com/ggml-org/llama.cpp/pull/9283
 enum slot_state {
     SLOT_STATE_IDLE,
     SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
@@ -274,7 +274,7 @@ struct server_task {
         params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
 
         params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
-        params.speculative.n_min = std::max(params.speculative.n_min, 2);
+        params.speculative.n_min = std::max(params.speculative.n_min, 0);
         params.speculative.n_max = std::max(params.speculative.n_max, 0);
 
         // Use OpenAI API logprobs only if n_probs wasn't provided
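
For context, the relaxed clamp above now only prevents negative values for params.speculative.n_min (the previous floor of 2 forced at least two drafted tokens). A small standalone sketch of the clamping arithmetic with hypothetical values, not taken from the server code:

#include <algorithm>
#include <cstdio>

int main() {
    const int n_max = 16;                        // hypothetical speculative.n_max
    for (int requested : {-3, 0, 2, 8, 64}) {    // hypothetical requested n_min values
        int n_min = std::min(n_max, requested);  // never above n_max
        n_min     = std::max(n_min, 0);          // new lower bound is 0 (previously 2)
        std::printf("requested=%3d -> n_min=%2d\n", requested, n_min);
    }
    return 0;
}
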
@@ -329,9 +329,6 @@ struct server_task {
         }
 
         // process "json_schema" and "grammar"
-        if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
-            throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
-        }
         if (data.contains("json_schema") && !data.contains("grammar")) {
             try {
                 auto schema                  = json_value(data, "json_schema", json::object());
@@ -1807,7 +1804,7 @@ struct server_context {
     // Necessary similarity of prompt for slot selection
     float slot_prompt_similarity = 0.0f;
 
-    common_chat_templates chat_templates;
+    common_chat_templates_ptr chat_templates;
 
     ~server_context() {
         // Clear any sampling context
@@ -1887,45 +1884,17 @@ struct server_context {
             llama_init_dft.context.reset();
         }
 
-        if (params_base.chat_template.empty() && !validate_builtin_chat_template(params.use_jinja)) {
+        chat_templates = common_chat_templates_init(model, params_base.chat_template);
+        try {
+            common_chat_format_example(chat_templates.get(), params.use_jinja);
+        } catch (const std::exception & e) {
             SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
-            chat_templates = common_chat_templates_from_model(model, "chatml");
-        } else {
-            chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
+            chat_templates = common_chat_templates_init(model, "chatml");
         }
-        GGML_ASSERT(chat_templates.template_default.get() != nullptr);
 
         return true;
     }
 
-    bool validate_builtin_chat_template(bool use_jinja) const {
-        llama_chat_message chat[] = {{"user", "test"}};
-
-        if (use_jinja) {
-            auto templates = common_chat_templates_from_model(model, "");
-            common_chat_inputs inputs;
-            inputs.messages = json::array({{
-                {"role", "user"},
-                {"content", "test"},
-            }});
-            GGML_ASSERT(templates.template_default);
-            try {
-                common_chat_params_init(*templates.template_default, inputs);
-                if (templates.template_tool_use) {
-                    common_chat_params_init(*templates.template_tool_use, inputs);
-                }
-                return true;
-            } catch (const std::exception & e) {
-                SRV_ERR("failed to apply template: %s\n", e.what());
-                return false;
-            }
-        } else {
-            const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
-            const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
-            return chat_res > 0;
-        }
-    }
-
     void init() {
         const int32_t n_ctx_slot = n_ctx / params_base.n_parallel;
 
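
For context, the hunk above replaces the old validate_builtin_chat_template() round-trip with an initialize-then-verify flow around the new opaque common_chat_templates_ptr handle. Below is a minimal sketch of that flow in isolation, assuming only the helpers that appear in this diff and the signatures implied by their use here; the wrapper name load_chat_templates and the include are hypothetical:

#include <exception>
#include <string>
// #include "chat.h"  // assumed: llama.cpp common header declaring common_chat_templates_ptr and friends

// Build templates from the requested (possibly empty) template string, verify them by
// formatting a sample conversation, and fall back to "chatml" if that throws.
static common_chat_templates_ptr load_chat_templates(const llama_model * model, const std::string & requested, bool use_jinja) {
    common_chat_templates_ptr tmpls = common_chat_templates_init(model, requested);
    try {
        common_chat_format_example(tmpls.get(), use_jinja); // throws if the built-in template cannot be applied
    } catch (const std::exception &) {
        tmpls = common_chat_templates_init(model, "chatml"); // same fallback as the SRV_WRN branch above
    }
    return tmpls;
}
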
@@ -3647,7 +3616,7 @@ int main(int argc, char ** argv) {
             }, {
                     {"name",  "n_busy_slots_per_decode"},
                     {"help",  "Average number of busy slots per llama_decode() call"},
-                    {"value",  (float) res_metrics->n_busy_slots_total / (float) res_metrics->n_decode_total}
+                    {"value",  (float) res_metrics->n_busy_slots_total / std::max((float) res_metrics->n_decode_total, 1.f)}
             }}},
             {"gauge", {{
                     {"name",  "prompt_tokens_seconds"},
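
For context, the new denominator guard keeps the exported "n_busy_slots_per_decode" average finite when no llama_decode() call has been counted yet. A tiny standalone illustration of the same expression with hypothetical counter values:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Hypothetical counters standing in for res_metrics->n_busy_slots_total and
// res_metrics->n_decode_total; clamping the denominator to at least 1 avoids 0/0 (NaN).
static float busy_slots_per_decode(uint64_t n_busy_slots_total, uint64_t n_decode_total) {
    return (float) n_busy_slots_total / std::max((float) n_decode_total, 1.f);
}

int main() {
    std::printf("%.2f\n", busy_slots_per_decode(0, 0)); // 0.00 instead of NaN
    std::printf("%.2f\n", busy_slots_per_decode(6, 4)); // 1.50
    return 0;
}
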
@@ -3813,13 +3782,15 @@ int main(int argc, char ** argv) {
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
             { "total_slots",                 ctx_server.params_base.n_parallel },
             { "model_path",                  ctx_server.params_base.model },
-            { "chat_template",               ctx_server.chat_templates.template_default->source() },
-            { "bos_token",                   ctx_server.chat_templates.template_default->bos_token() },
-            { "eos_token",                   ctx_server.chat_templates.template_default->eos_token() },
+            { "chat_template",               common_chat_templates_source(ctx_server.chat_templates.get()) },
+            { "bos_token",                   common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
+            { "eos_token",                   common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
             { "build_info",                  build_info },
         };
-        if (ctx_server.params_base.use_jinja && ctx_server.chat_templates.template_tool_use) {
-            data["chat_template_tool_use"] = ctx_server.chat_templates.template_tool_use->source();
+        if (ctx_server.params_base.use_jinja) {
+            if (auto tool_use_src = common_chat_templates_source(ctx_server.chat_templates.get(), "tool_use")) {
+                data["chat_template_tool_use"] = tool_use_src;
+            }
         }
 
         res_ok(res, data);
@@ -4054,7 +4025,7 @@ int main(int argc, char ** argv) {
         }
 
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
 
         return handle_completions_impl(
             SERVER_TASK_TYPE_COMPLETION,
@@ -4067,7 +4038,7 @@ int main(int argc, char ** argv) {
     // same with handle_chat_completions, but without inference part
     const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
         res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
     };
 
@@ -4254,6 +4225,11 @@ int main(int argc, char ** argv) {
         //    return;
         //}
 
+        // if true, use TEI API format, otherwise use Jina API format
+        // Jina: https://jina.ai/reranker/
+        // TEI: https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/rerank
+        bool is_tei_format = body.contains("texts");
+
         json query;
         if (body.count("query") == 1) {
             query = body.at("query");
@@ -4266,7 +4242,8 @@ int main(int argc, char ** argv) {
             return;
         }
 
-        std::vector<std::string> documents = json_value(body, "documents", std::vector<std::string>());
+        std::vector<std::string> documents = json_value(body, "documents",
+                                             json_value(body, "texts", std::vector<std::string>()));
         if (documents.empty()) {
             res_error(res, format_error_response("\"documents\" must be a non-empty string array", ERROR_TYPE_INVALID_REQUEST));
             return;
@@ -4311,7 +4288,12 @@ int main(int argc, char ** argv) {
         }
 
         // write JSON response
-        json root = format_response_rerank(body, responses);
+        json root = format_response_rerank(
+            body,
+            responses,
+            is_tei_format,
+            documents);
+
         res_ok(res, root);
     };
 
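
For context, the rerank endpoint now accepts both request shapes: a Jina-style body carries the documents under "documents", while a TEI-style body carries them under "texts", and the response formatter is told which format was detected. A minimal sketch of the detection and fallback, with hypothetical request bodies; it uses nlohmann::json directly (the server's json_value helper is replaced here by json::value):

#include <nlohmann/json.hpp>
#include <cstdio>
#include <string>
#include <vector>

using json = nlohmann::ordered_json;

int main() {
    // hypothetical Jina-style body (https://jina.ai/reranker/)
    json jina_body = {
        {"query",     "which document mentions bears?"},
        {"documents", {"doc a", "doc b"}}
    };
    // hypothetical TEI-style body
    // (https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/rerank)
    json tei_body = {
        {"query", "which document mentions bears?"},
        {"texts", {"doc a", "doc b"}}
    };

    for (const json & body : {jina_body, tei_body}) {
        // same detection as in the handler above
        const bool is_tei_format = body.contains("texts");
        // same fallback as in the handler above: prefer "documents", otherwise "texts"
        const std::vector<std::string> documents = body.value("documents",
                                                   body.value("texts", std::vector<std::string>()));
        std::printf("is_tei_format=%d n_documents=%zu\n", is_tei_format, documents.size());
    }
    return 0;
}
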
@@ -4473,8 +4455,8 @@ int main(int argc, char ** argv) {
 
     // print sample chat example to make it clear which template is used
     LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
-        ctx_server.chat_templates.template_default->source().c_str(),
-        common_chat_format_example(*ctx_server.chat_templates.template_default, ctx_server.params_base.use_jinja).c_str());
+        common_chat_templates_source(ctx_server.chat_templates.get()),
+        common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str());
 
     ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) {
         ctx_server.process_single_task(task);