server : Support multimodal completion and embeddings prompts in JSON format (#15108)

- Use server_tokens in more places in server and util.cpp
- Convert most functions that used llama_tokens to server_tokens
- Modify input tokenizer to handle JSON objects as subprompts
- Break out MTMD prompt parsing into a utility function
- Support JSON objects carrying multimodal_data arrays as MTMD prompts, alongside the other existing prompt types (see the request sketch below)
- Add a capability entry to the model endpoints to indicate whether the client can send multimodal data
- Add tests
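
A minimal sketch of the request shape this enables, for orientation only: apart from the multimodal_data field named above and the usual MTMD media marker, the endpoint and field names are illustrative assumptions, not quotes from the patch. A prompt may now be a JSON object (or an array mixing strings, token lists, and such objects) that pairs prompt text with base64-encoded media:

    POST /completion
    {
      "prompt": {
        "prompt": "Describe the attached file: <__media__>",
        "multimodal_data": ["<base64-encoded image or audio bytes>"]
      },
      "n_predict": 64
    }

A server started without a multimodal projector rejects such prompts; clients can check first, since the /api/show and /models responses below now list "multimodal" next to "completion" in their capabilities arrays.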
Author: 65a
Date: 2025-08-22 08:10:14 +00:00 (committed by GitHub)
Parent: e288693669
Commit: 4afb0a746f
5 changed files with 323 additions and 138 deletions

@@ -4309,6 +4309,7 @@ int main(int argc, char ** argv) {
};
const auto handle_api_show = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
+ bool has_mtmd = ctx_server.mctx != nullptr;
json data = {
{
"template", common_chat_templates_source(ctx_server.chat_templates.get()),
@@ -4330,7 +4331,7 @@ int main(int argc, char ** argv) {
{"quantization_level", ""}
}},
{"model_info", ""},
{"capabilities", {"completion"}}
{"capabilities", has_mtmd ? json({"completion","multimodal"}) : json({"completion"})}
};
res_ok(res, data);
@@ -4356,56 +4357,15 @@ int main(int argc, char ** argv) {
// TODO: this log can become very long, put it behind a flag or think about a more compact format
//SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
- // process files
- mtmd::bitmaps bitmaps;
- const bool has_mtmd = ctx_server.mctx != nullptr;
- {
-     if (!has_mtmd && !files.empty()) {
-         throw std::runtime_error("This server does not support multimodal");
-     }
-     for (auto & file : files) {
-         mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(ctx_server.mctx, file.data(), file.size()));
-         if (!bmp.ptr) {
-             throw std::runtime_error("Failed to load image or audio file");
-         }
-         // calculate bitmap hash (for KV caching)
-         std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
-         bmp.set_id(hash.c_str());
-         bitmaps.entries.push_back(std::move(bmp));
-     }
- }
// process prompt
std::vector<server_tokens> inputs;
- if (oaicompat && has_mtmd) {
-     // multimodal
-     std::string prompt_str = prompt.get<std::string>();
-     mtmd_input_text inp_txt = {
-         prompt_str.c_str(),
-         /* add_special */ true,
-         /* parse_special */ true,
-     };
-     mtmd::input_chunks chunks(mtmd_input_chunks_init());
-     auto bitmaps_c_ptr = bitmaps.c_ptr();
-     int32_t tokenized = mtmd_tokenize(ctx_server.mctx,
-         chunks.ptr.get(),
-         &inp_txt,
-         bitmaps_c_ptr.data(),
-         bitmaps_c_ptr.size());
-     if (tokenized != 0) {
-         throw std::runtime_error("Failed to tokenize prompt");
-     }
-     server_tokens tmp(chunks, true);
-     inputs.push_back(std::move(tmp));
+ if (oaicompat && ctx_server.mctx != nullptr) {
+     // This is the case used by OAI compatible chat path with MTMD. TODO It can be moved to the path below.
+     inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt.get<std::string>(), files));
} else {
-     // non-multimodal version
-     auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
-     for (auto & p : tokenized_prompts) {
-         auto tmp = server_tokens(p, ctx_server.mctx != nullptr);
-         inputs.push_back(std::move(tmp));
-     }
+     // Everything else, including multimodal completions.
+     inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
}
tasks.reserve(inputs.size());
@@ -4574,7 +4534,7 @@ int main(int argc, char ** argv) {
data["input_extra"] = input_extra; // default to empty array if it's not exist
std::string prompt = json_value(data, "prompt", std::string());
- std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, false, true);
+ std::vector<server_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, false, true);
SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
data["prompt"] = format_infill(
ctx_server.vocab,
@@ -4585,7 +4545,7 @@ int main(int argc, char ** argv) {
ctx_server.params_base.n_predict,
ctx_server.slots[0].n_ctx, // TODO: there should be a better way
ctx_server.params_base.spm_infill,
- tokenized_prompts[0]
+ tokenized_prompts[0].get_text_tokens() // TODO: this could maybe be multimodal.
);
std::vector<raw_buffer> files; // dummy
@@ -4634,7 +4594,7 @@ int main(int argc, char ** argv) {
if (current_state == SERVER_STATE_READY) {
model_meta = ctx_server.model_meta();
}
+ bool has_mtmd = ctx_server.mctx != nullptr;
json models = {
{"models", {
{
@@ -4646,7 +4606,7 @@ int main(int argc, char ** argv) {
{"type", "model"},
{"description", ""},
{"tags", {""}},
{"capabilities", {"completion"}},
{"capabilities", has_mtmd ? json({"completion","multimodal"}) : json({"completion"})},
{"parameters", ""},
{"details", {
{"parent_model", ""},
@@ -4763,7 +4723,7 @@ int main(int argc, char ** argv) {
}
}
- auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
+ auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
for (const auto & tokens : tokenized_prompts) {
// this check is necessary for models that do not add BOS token to the input
if (tokens.empty()) {
@@ -4791,7 +4751,7 @@ int main(int argc, char ** argv) {
task.id = ctx_server.queue_tasks.get_new_id();
task.index = i;
- task.prompt_tokens = server_tokens(tokenized_prompts[i], ctx_server.mctx != nullptr);
+ task.prompt_tokens = std::move(tokenized_prompts[i]);
// OAI-compat
task.params.oaicompat = oaicompat;
@@ -4878,7 +4838,10 @@ int main(int argc, char ** argv) {
return;
}
- llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.vocab, query, /* add_special */ false, true)[0];
+ std::vector<server_tokens> tokenized_queries = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, query, /* add_special */ false, true);
+ if (tokenized_queries.size() != 1) {
+     res_error(res, format_error_response("\"query\" must contain only a single prompt", ERROR_TYPE_INVALID_REQUEST));
+ }
// create and queue the task
json responses = json::array();
@@ -4886,14 +4849,14 @@ int main(int argc, char ** argv) {
std::unordered_set<int> task_ids;
{
std::vector<server_task> tasks;
- auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
+ auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, documents, /* add_special */ false, true);
tasks.reserve(tokenized_docs.size());
for (size_t i = 0; i < tokenized_docs.size(); i++) {
- auto tmp = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
+ auto tmp = format_rerank(ctx_server.vocab, tokenized_queries[0], tokenized_docs[i]);
server_task task = server_task(SERVER_TASK_TYPE_RERANK);
task.id = ctx_server.queue_tasks.get_new_id();
task.index = i;
- task.prompt_tokens = server_tokens(tmp, ctx_server.mctx != nullptr);
+ task.prompt_tokens = std::move(tmp);
tasks.push_back(std::move(task));
}