server: split HTTP into its own interface (#17216)

* server: split HTTP into its own interface
* move server-http and httplib to its own file
* add the remaining endpoints
* fix exception/error handling
* renaming
* missing header
* fix missing windows header
* fix error responses from http layer
* fix slot save/restore handler
* fix case where only one stream chunk is returned
* add NOMINMAX
* do not call sink.write on empty data
* use safe_json_to_str for SSE
* clean up
* add some comments
* improve usage of next()
* bring back the "server is listening on" message
* more generic handler
* add req.headers
* move the chat template print to init()
* add req.path
* cont : minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-11-19 11:57:07 +00:00 · 2025-11-17 22:05:44 +01:00
parent 38e2c1b412
commit 0de8878c96
5 changed files with 1245 additions and 930 deletions
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -9,8 +9,6 @@
 #include "mtmd-helper.h"
 #include "chat.h"

-#include <cpp-httplib/httplib.h>
-
 #define JSON_ASSERT GGML_ASSERT
 #include <nlohmann/json.hpp>

@@ -426,6 +424,10 @@ static std::string gen_tool_call_id() {
 // other common utils
 //

+static std::string safe_json_to_str(const json & data) {
+    return data.dump(-1, ' ', false, json::error_handler_t::replace);
+}
+
 // TODO: reuse llama_detokenize
 template <class Iter>
 static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
@@ -453,29 +455,25 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx,
    return out;
 }

+// format server-sent event (SSE), return the formatted string to send
 // note: if data is a json array, it will be sent as multiple events, one per item
-static bool server_sent_event(httplib::DataSink & sink, const json & data) {
-    static auto send_single = [](httplib::DataSink & sink, const json & data) -> bool {
-        const std::string str =
-            "data: " +
-            data.dump(-1, ' ', false, json::error_handler_t::replace) +
+static std::string format_sse(const json & data) {
+    std::ostringstream ss;
+    auto send_single = [&ss](const json & data) {
+        ss << "data: " <<
+            safe_json_to_str(data) <<
            "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row).
-
-        LOG_DBG("data stream, to_send: %s", str.c_str());
-        return sink.write(str.c_str(), str.size());
    };

    if (data.is_array()) {
        for (const auto & item : data) {
-            if (!send_single(sink, item)) {
-                return false;
-            }
+            send_single(item);
        }
    } else {
-        return send_single(sink, data);
+        send_single(data);
    }

-    return true;
+    return ss.str();
 }

 //
@@ -954,10 +952,6 @@ static json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias)
    return data;
 }

-static std::string safe_json_to_str(const json & data) {
-    return data.dump(-1, ' ', false, json::error_handler_t::replace);
-}
-
 static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
    std::vector<llama_token_data> cur;
    const auto * logits = llama_get_logits_ith(ctx, idx);