server: split HTTP into its own interface (#17216)

* server: split HTTP into its own interface

* move server-http and httplib to its own file

* add the remaining endpoints

* fix exception/error handling

* renaming

* missing header

* fix missing windows header

* fix error responses from http layer

* fix slot save/restore handler

* fix case where only one stream chunk is returned

* add NOMINMAX

* do not call sink.write on empty data

* use safe_json_to_str for SSE

* clean up

* add some comments

* improve usage of next()

* bring back the "server is listening on" message

* more generic handler

* add req.headers

* move the chat template print to init()

* add req.path

* cont : minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Xuan-Son Nguyen
2025-11-17 22:05:44 +01:00
committed by GitHub
parent 38e2c1b412
commit 0de8878c96
5 changed files with 1245 additions and 930 deletions

View File

@@ -9,8 +9,6 @@
#include "mtmd-helper.h"
#include "chat.h"
#include <cpp-httplib/httplib.h>
#define JSON_ASSERT GGML_ASSERT
#include <nlohmann/json.hpp>
@@ -426,6 +424,10 @@ static std::string gen_tool_call_id() {
// other common utils
//
static std::string safe_json_to_str(const json & data) {
return data.dump(-1, ' ', false, json::error_handler_t::replace);
}
// TODO: reuse llama_detokenize
template <class Iter>
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
@@ -453,29 +455,25 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx,
return out;
}
// format server-sent event (SSE), return the formatted string to send
// note: if data is a json array, it will be sent as multiple events, one per item
static bool server_sent_event(httplib::DataSink & sink, const json & data) {
static auto send_single = [](httplib::DataSink & sink, const json & data) -> bool {
const std::string str =
"data: " +
data.dump(-1, ' ', false, json::error_handler_t::replace) +
static std::string format_sse(const json & data) {
std::ostringstream ss;
auto send_single = [&ss](const json & data) {
ss << "data: " <<
safe_json_to_str(data) <<
"\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row).
LOG_DBG("data stream, to_send: %s", str.c_str());
return sink.write(str.c_str(), str.size());
};
if (data.is_array()) {
for (const auto & item : data) {
if (!send_single(sink, item)) {
return false;
}
send_single(item);
}
} else {
return send_single(sink, data);
send_single(data);
}
return true;
return ss.str();
}
//
@@ -954,10 +952,6 @@ static json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias)
return data;
}
static std::string safe_json_to_str(const json & data) {
return data.dump(-1, ' ', false, json::error_handler_t::replace);
}
static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
std::vector<llama_token_data> cur;
const auto * logits = llama_get_logits_ith(ctx, idx);