Mirror of https://github.com/ggml-org/llama.cpp.git
server: split HTTP into its own interface (#17216)
* server: split HTTP into its own interface
* move server-http and httplib to its own file
* add the remaining endpoints
* fix exception/error handling
* renaming
* missing header
* fix missing windows header
* fix error responses from http layer
* fix slot save/restore handler
* fix case where only one stream chunk is returned
* add NOMINMAX
* do not call sink.write on empty data
* use safe_json_to_str for SSE
* clean up
* add some comments
* improve usage of next()
* bring back the "server is listening on" message
* more generic handler
* add req.headers
* move the chat template print to init()
* add req.path
* cont : minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
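The theme of the refactor is that request handling no longer touches httplib types directly: handlers produce plain data (JSON, preformatted SSE strings) and a separate HTTP layer owns the transport. Below is a minimal sketch of that shape, using hypothetical names (chunk_writer, stream_results) that are illustrative only and are not the interface added by this PR:

#include <functional>
#include <iostream>
#include <string>

// The HTTP layer supplies a writer; generation code never sees the socket
// or httplib::DataSink, it only hands over ready-to-send strings.
using chunk_writer = std::function<bool(const std::string &)>;

static bool stream_results(const chunk_writer & write) {
    for (int i = 0; i < 3; ++i) {
        const std::string event = "data: {\"chunk\": " + std::to_string(i) + "}\n\n";
        if (!write(event)) {
            return false; // writer reports the client went away
        }
    }
    return true;
}

int main() {
    // Stand-in transport: print to stdout instead of an HTTP response.
    chunk_writer to_stdout = [](const std::string & s) { std::cout << s; return true; };
    return stream_results(to_stdout) ? 0 : 1;
}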
@@ -9,8 +9,6 @@
 #include "mtmd-helper.h"
 #include "chat.h"
 
-#include <cpp-httplib/httplib.h>
-
 #define JSON_ASSERT GGML_ASSERT
 #include <nlohmann/json.hpp>
 
@@ -426,6 +424,10 @@ static std::string gen_tool_call_id() {
 // other common utils
 //
 
+static std::string safe_json_to_str(const json & data) {
+    return data.dump(-1, ' ', false, json::error_handler_t::replace);
+}
+
 // TODO: reuse llama_detokenize
 template <class Iter>
 static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
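The error_handler_t::replace argument is what makes this helper "safe": nlohmann::json's dump() throws on strings that are not valid UTF-8, while the replace handler substitutes U+FFFD and always produces output, which matters when token text can be cut in the middle of a multi-byte sequence. A small standalone check of that behavior (plain nlohmann::json, independent of the server sources):

#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    // "\xC3" on its own is a truncated UTF-8 sequence.
    json j = {{"text", std::string("partial \xC3")}};

    try {
        (void) j.dump(); // the default error handler is strict and throws here
    } catch (const json::type_error & e) {
        std::cout << "strict dump threw: " << e.what() << "\n";
    }

    // Same call safe_json_to_str makes: invalid bytes become U+FFFD.
    std::cout << j.dump(-1, ' ', false, json::error_handler_t::replace) << "\n";
}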
@@ -453,29 +455,25 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx,
     return out;
 }
 
+// format server-sent event (SSE), return the formatted string to send
 // note: if data is a json array, it will be sent as multiple events, one per item
-static bool server_sent_event(httplib::DataSink & sink, const json & data) {
-    static auto send_single = [](httplib::DataSink & sink, const json & data) -> bool {
-        const std::string str =
-            "data: " +
-            data.dump(-1, ' ', false, json::error_handler_t::replace) +
+static std::string format_sse(const json & data) {
+    std::ostringstream ss;
+    auto send_single = [&ss](const json & data) {
+        ss << "data: " <<
+            safe_json_to_str(data) <<
             "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row).
-
-        LOG_DBG("data stream, to_send: %s", str.c_str());
-        return sink.write(str.c_str(), str.size());
     };
 
     if (data.is_array()) {
         for (const auto & item : data) {
-            if (!send_single(sink, item)) {
-                return false;
-            }
+            send_single(item);
         }
     } else {
-        return send_single(sink, data);
+        send_single(data);
    }
 
-    return true;
+    return ss.str();
 }
 
 //
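Because format_sse only builds the string, deciding when to write it is now the HTTP layer's job; per the commit notes, empty data is no longer passed to sink.write. A hedged usage sketch follows, with safe_json_to_str and format_sse copied from the hunks above so it compiles on its own, and a stand-in send callback in place of the server's real streaming interface:

#include <functional>
#include <iostream>
#include <sstream>
#include <string>

#include <nlohmann/json.hpp>

using json = nlohmann::json;

static std::string safe_json_to_str(const json & data) {
    return data.dump(-1, ' ', false, json::error_handler_t::replace);
}

// condensed copy of format_sse from the diff above
static std::string format_sse(const json & data) {
    std::ostringstream ss;
    auto send_single = [&ss](const json & d) {
        ss << "data: " << safe_json_to_str(d) << "\n\n";
    };
    if (data.is_array()) {
        for (const auto & item : data) {
            send_single(item);
        }
    } else {
        send_single(data);
    }
    return ss.str();
}

int main() {
    // Hypothetical writer standing in for the HTTP transport.
    auto send = [](const std::string & chunk) {
        if (chunk.empty()) {
            return true; // skip empty payloads instead of writing them
        }
        std::cout << chunk;
        return true;
    };

    // An array is flattened into one string holding several SSE events.
    json tokens = json::array();
    tokens.push_back(json{{"content", "Hel"}});
    tokens.push_back(json{{"content", "lo"}});
    send(format_sse(tokens));
    send(format_sse({{"stop", true}}));
    return 0;
}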
@@ -954,10 +952,6 @@ static json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias)
     return data;
 }
 
-static std::string safe_json_to_str(const json & data) {
-    return data.dump(-1, ' ', false, json::error_handler_t::replace);
-}
-
 static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
     std::vector<llama_token_data> cur;
     const auto * logits = llama_get_logits_ith(ctx, idx);