server : Support multimodal completion and embeddings prompts in JSON format (#15108)
- Use server_tokens in more places in server and util.cpp
- Convert most functions that used llama_tokens to server_tokens
- Modify the input tokenizer to handle JSON objects as subprompts
- Break out MTMD prompt parsing into a utility function
- Support JSON objects with multimodal_data arrays for MTMD prompts, alongside the other existing prompt types
- Add a capability flag to the model endpoint so clients can tell whether they may send multimodal data
- Add tests
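
For reference, the new input format accepts a JSON object in place of a plain prompt string, with media attached as a multimodal_data array. Below is a minimal sketch of building such a request body with nlohmann::json (the library the server itself uses); the multimodal_data name comes from the commit message, while the inner prompt field name, the base64 encoding of the media, and n_predict are illustrative assumptions, so check the server README for the exact schema.

    #include <nlohmann/json.hpp>
    #include <string>

    using json = nlohmann::json;

    // Sketch: a /completion request whose prompt is a JSON object carrying
    // text plus attached media, instead of a bare string.
    static json make_multimodal_request(const std::string & text,
                                        const std::string & media_base64) {
        return json{
            {"prompt", {
                {"prompt",          text},                        // assumed name for the text subprompt
                {"multimodal_data", json::array({media_base64})}, // base64-encoded image/audio buffers
            }},
            {"n_predict", 64},
        };
    }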
@@ -4309,6 +4309,7 @@ int main(int argc, char ** argv) {
     };

     const auto handle_api_show = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
+        bool has_mtmd = ctx_server.mctx != nullptr;
         json data = {
             {
                 "template", common_chat_templates_source(ctx_server.chat_templates.get()),
@@ -4330,7 +4331,7 @@ int main(int argc, char ** argv) {
                 {"quantization_level", ""}
             }},
             {"model_info", ""},
-            {"capabilities", {"completion"}}
+            {"capabilities", has_mtmd ? json({"completion","multimodal"}) : json({"completion"})}
         };

         res_ok(res, data);
@@ -4356,56 +4357,15 @@ int main(int argc, char ** argv) {
         // TODO: this log can become very long, put it behind a flag or think about a more compact format
         //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());

-        // process files
-        mtmd::bitmaps bitmaps;
-        const bool has_mtmd = ctx_server.mctx != nullptr;
-        {
-            if (!has_mtmd && !files.empty()) {
-                throw std::runtime_error("This server does not support multimodal");
-            }
-            for (auto & file : files) {
-                mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(ctx_server.mctx, file.data(), file.size()));
-                if (!bmp.ptr) {
-                    throw std::runtime_error("Failed to load image or audio file");
-                }
-                // calculate bitmap hash (for KV caching)
-                std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
-                bmp.set_id(hash.c_str());
-                bitmaps.entries.push_back(std::move(bmp));
-            }
-        }
-
         // process prompt
         std::vector<server_tokens> inputs;

-        if (oaicompat && has_mtmd) {
-            // multimodal
-            std::string prompt_str = prompt.get<std::string>();
-            mtmd_input_text inp_txt = {
-                prompt_str.c_str(),
-                /* add_special */ true,
-                /* parse_special */ true,
-            };
-            mtmd::input_chunks chunks(mtmd_input_chunks_init());
-            auto bitmaps_c_ptr = bitmaps.c_ptr();
-            int32_t tokenized = mtmd_tokenize(ctx_server.mctx,
-                                              chunks.ptr.get(),
-                                              &inp_txt,
-                                              bitmaps_c_ptr.data(),
-                                              bitmaps_c_ptr.size());
-            if (tokenized != 0) {
-                throw std::runtime_error("Failed to tokenize prompt");
-            }
-
-            server_tokens tmp(chunks, true);
-            inputs.push_back(std::move(tmp));
+        if (oaicompat && ctx_server.mctx != nullptr) {
+            // This is the case used by OAI compatible chat path with MTMD. TODO It can be moved to the path below.
+            inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt.get<std::string>(), files));
         } else {
-            // non-multimodal version
-            auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
-            for (auto & p : tokenized_prompts) {
-                auto tmp = server_tokens(p, ctx_server.mctx != nullptr);
-                inputs.push_back(std::move(tmp));
-            }
+            // Everything else, including multimodal completions.
+            inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
         }

         tasks.reserve(inputs.size());
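
The bitmap loading and mtmd_tokenize logic removed above is what the new process_mtmd_prompt() helper called here has to perform. A rough reconstruction from the removed lines and the call site follows; the exact signature and the helper's real location in the server utilities are assumptions.

    // Reconstruction only -- mirrors the logic the diff removes from main();
    // the real helper lives in the server utilities.
    static server_tokens process_mtmd_prompt(mtmd_context * mctx,
                                             const std::string & prompt,
                                             const std::vector<raw_buffer> & files) {
        // load each attached file into an mtmd bitmap (image or audio)
        mtmd::bitmaps bitmaps;
        for (const auto & file : files) {
            mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size()));
            if (!bmp.ptr) {
                throw std::runtime_error("Failed to load image or audio file");
            }
            // hash the bitmap so repeated media can be reused from the KV cache
            bmp.set_id(fnv_hash(bmp.data(), bmp.n_bytes()).c_str());
            bitmaps.entries.push_back(std::move(bmp));
        }

        // tokenize the text together with the media into interleaved chunks
        mtmd_input_text inp_txt = {
            prompt.c_str(),
            /* add_special */ true,
            /* parse_special */ true,
        };
        mtmd::input_chunks chunks(mtmd_input_chunks_init());
        auto bitmaps_c_ptr = bitmaps.c_ptr();
        if (mtmd_tokenize(mctx, chunks.ptr.get(), &inp_txt,
                          bitmaps_c_ptr.data(), bitmaps_c_ptr.size()) != 0) {
            throw std::runtime_error("Failed to tokenize prompt");
        }
        return server_tokens(chunks, /* has_mtmd */ true);
    }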
@@ -4574,7 +4534,7 @@ int main(int argc, char ** argv) {
         data["input_extra"] = input_extra; // default to empty array if it's not exist

         std::string prompt = json_value(data, "prompt", std::string());
-        std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, false, true);
+        std::vector<server_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, false, true);
         SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
         data["prompt"] = format_infill(
             ctx_server.vocab,
@@ -4585,7 +4545,7 @@ int main(int argc, char ** argv) {
             ctx_server.params_base.n_predict,
             ctx_server.slots[0].n_ctx, // TODO: there should be a better way
             ctx_server.params_base.spm_infill,
-            tokenized_prompts[0]
+            tokenized_prompts[0].get_text_tokens() // TODO: this could maybe be multimodal.
         );

         std::vector<raw_buffer> files; // dummy
@@ -4634,7 +4594,7 @@ int main(int argc, char ** argv) {
         if (current_state == SERVER_STATE_READY) {
             model_meta = ctx_server.model_meta();
         }
-
+        bool has_mtmd = ctx_server.mctx != nullptr;
         json models = {
             {"models", {
                 {
@@ -4646,7 +4606,7 @@ int main(int argc, char ** argv) {
                     {"type", "model"},
                     {"description", ""},
                     {"tags", {""}},
-                    {"capabilities", {"completion"}},
+                    {"capabilities", has_mtmd ? json({"completion","multimodal"}) : json({"completion"})},
                     {"parameters", ""},
                     {"details", {
                         {"parent_model", ""},
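
With the capability advertised, a client can check the models endpoint before attaching media to a request. A hypothetical client-side helper over the returned JSON is sketched below; only the models and capabilities field names come from the diff, and the transport and error handling are omitted.

    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    // Returns true if any advertised model lists the "multimodal" capability
    // added by this change; callers fetch the endpoint response themselves.
    static bool server_supports_multimodal(const json & models_response) {
        for (const auto & model : models_response.value("models", json::array())) {
            for (const auto & cap : model.value("capabilities", json::array())) {
                if (cap == "multimodal") {
                    return true;
                }
            }
        }
        return false;
    }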
@@ -4763,7 +4723,7 @@ int main(int argc, char ** argv) {
             }
         }

-        auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
+        auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
         for (const auto & tokens : tokenized_prompts) {
             // this check is necessary for models that do not add BOS token to the input
             if (tokens.empty()) {
@@ -4791,7 +4751,7 @@ int main(int argc, char ** argv) {

             task.id = ctx_server.queue_tasks.get_new_id();
             task.index = i;
-            task.prompt_tokens = server_tokens(tokenized_prompts[i], ctx_server.mctx != nullptr);
+            task.prompt_tokens = std::move(tokenized_prompts[i]);

             // OAI-compat
             task.params.oaicompat = oaicompat;
@@ -4878,7 +4838,10 @@ int main(int argc, char ** argv) {
             return;
         }

-        llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.vocab, query, /* add_special */ false, true)[0];
+        std::vector<server_tokens> tokenized_queries = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, query, /* add_special */ false, true);
+        if (tokenized_queries.size() != 1) {
+            res_error(res, format_error_response("\"query\" must contain only a single prompt", ERROR_TYPE_INVALID_REQUEST));
+        }

         // create and queue the task
         json responses = json::array();
@@ -4886,14 +4849,14 @@ int main(int argc, char ** argv) {
         std::unordered_set<int> task_ids;
         {
             std::vector<server_task> tasks;
-            auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
+            auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, documents, /* add_special */ false, true);
             tasks.reserve(tokenized_docs.size());
             for (size_t i = 0; i < tokenized_docs.size(); i++) {
-                auto tmp = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
+                auto tmp = format_rerank(ctx_server.vocab, tokenized_queries[0], tokenized_docs[i]);
                 server_task task = server_task(SERVER_TASK_TYPE_RERANK);
                 task.id = ctx_server.queue_tasks.get_new_id();
                 task.index = i;
-                task.prompt_tokens = server_tokens(tmp, ctx_server.mctx != nullptr);
+                task.prompt_tokens = std::move(tmp);
                 tasks.push_back(std::move(task));
             }
