mirror of https://github.com/ggml-org/llama.cpp.git
synced 2025-10-27 08:21:30 +00:00
llama : add support for qwen3 reranker (#15824)
@@ -5093,21 +5093,15 @@ int main(int argc, char ** argv) {
             return;
         }
 
-        std::vector<server_tokens> tokenized_queries = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, query, /* add_special */ false, true);
-        if (tokenized_queries.size() != 1) {
-            res_error(res, format_error_response("\"query\" must contain only a single prompt", ERROR_TYPE_INVALID_REQUEST));
-        }
-
         // create and queue the task
         json responses = json::array();
         bool error = false;
         std::unordered_set<int> task_ids;
         {
             std::vector<server_task> tasks;
-            auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, documents, /* add_special */ false, true);
-            tasks.reserve(tokenized_docs.size());
-            for (size_t i = 0; i < tokenized_docs.size(); i++) {
-                auto tmp = format_rerank(ctx_server.vocab, tokenized_queries[0], tokenized_docs[i]);
+            tasks.reserve(documents.size());
+            for (size_t i = 0; i < documents.size(); i++) {
+                auto tmp = format_rerank(ctx_server.model, ctx_server.vocab, ctx_server.mctx, query, documents[i]);
                 server_task task = server_task(SERVER_TASK_TYPE_RERANK);
                 task.id = ctx_server.queue_tasks.get_new_id();
                 task.index = i;
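In this hunk, tokenization of the query and documents moves out of the endpoint and into format_rerank, which now receives raw strings: the template path added below needs untokenized text to substitute into the model's rerank prompt. A minimal sketch of the probe this enables, assuming only a valid loaded llama_model (llama_model_chat_template is the existing llama.h accessor; it returns nullptr when the named template is absent):

#include "llama.h"

#include <cstdio>

// Sketch: check whether a loaded model ships a dedicated "rerank" chat
// template in its GGUF metadata, as the new format_rerank does.
// Model loading is omitted; `model` is assumed to be valid.
static bool has_rerank_template(const llama_model * model) {
    const char * tmpl = llama_model_chat_template(model, /* name */ "rerank");
    if (tmpl == nullptr) {
        // no dedicated template: the server falls back to the
        // token-based [BOS]query[EOS][SEP]doc[EOS] layout
        return false;
    }
    printf("rerank template:\n%s\n", tmpl);
    return true;
}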
@@ -1368,34 +1368,6 @@ static std::string fnv_hash(const uint8_t * data, size_t len) {
     return std::to_string(hash);
 }
 
-// format rerank task: [BOS]query[EOS][SEP]doc[EOS].
-static server_tokens format_rerank(const struct llama_vocab * vocab, server_tokens & query, server_tokens & doc) {
-    server_tokens result = {};
-
-    // Get EOS token - use SEP token as fallback if EOS is not available
-    llama_token eos_token = llama_vocab_eos(vocab);
-    if (eos_token == LLAMA_TOKEN_NULL) {
-        eos_token = llama_vocab_sep(vocab);
-    }
-    if (llama_vocab_get_add_bos(vocab)) {
-        result.push_back(llama_vocab_bos(vocab));
-    }
-    result.push_back(query);
-    if (llama_vocab_get_add_eos(vocab)) {
-        result.push_back(eos_token);
-    }
-    if (llama_vocab_get_add_sep(vocab)) {
-        result.push_back(llama_vocab_sep(vocab));
-    }
-    result.push_back(doc);
-    if (llama_vocab_get_add_eos(vocab)) {
-        result.push_back(eos_token);
-    }
-    return result;
-}
-
-
 static server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files) {
     mtmd::bitmaps bitmaps;
     for (auto & file : files) {
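The helper removed here is not lost: it reappears almost verbatim as the fallback branch of the new format_rerank in the next hunk, so models without a dedicated rerank template keep the same token layout. Schematically (illustrative, for a vocab where add_bos, add_eos, and add_sep are all enabled):

    [BOS] <query tokens> [EOS] [SEP] <document tokens> [EOS]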
@@ -1501,3 +1473,43 @@ static std::vector<server_tokens> tokenize_input_prompts(const llama_vocab * voc
     }
     return result;
 }
+
+// format rerank task: [BOS]query[EOS][SEP]doc[EOS].
+static server_tokens format_rerank(const struct llama_model * model, const struct llama_vocab * vocab, mtmd_context * mctx, const std::string & query, const std::string & doc) {
+    server_tokens result = {};
+
+    const char * rerank_prompt = llama_model_chat_template(model, "rerank");
+
+    if (rerank_prompt != nullptr) {
+        std::string prompt = rerank_prompt;
+        string_replace_all(prompt, "{query}"   , query);
+        string_replace_all(prompt, "{document}", doc  );
+        server_tokens tokens = tokenize_input_subprompt(vocab, mctx, prompt, false, true);
+        result.push_back(tokens);
+    } else {
+        // Get EOS token - use SEP token as fallback if EOS is not available
+        server_tokens query_tokens = tokenize_input_subprompt(vocab, mctx, query, false, false);
+        server_tokens doc_tokens   = tokenize_input_subprompt(vocab, mctx, doc,   false, false);
+        llama_token eos_token = llama_vocab_eos(vocab);
+        if (eos_token == LLAMA_TOKEN_NULL) {
+            eos_token = llama_vocab_sep(vocab);
+        }
+
+        if (llama_vocab_get_add_bos(vocab)) {
+            result.push_back(llama_vocab_bos(vocab));
+        }
+        result.push_back(query_tokens);
+        if (llama_vocab_get_add_eos(vocab)) {
+            result.push_back(eos_token);
+        }
+        if (llama_vocab_get_add_sep(vocab)) {
+            result.push_back(llama_vocab_sep(vocab));
+        }
+        result.push_back(doc_tokens);
+        if (llama_vocab_get_add_eos(vocab)) {
+            result.push_back(eos_token);
+        }
+    }
+
+    return result;
+}
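For reference, a self-contained sketch of the substitution the template branch performs. The template string below is a made-up stand-in (the real one comes from the model's GGUF metadata), and replace_all mirrors the contract of llama.cpp's string_replace_all helper; only the {query}/{document} placeholder convention is taken from the diff:

#include <iostream>
#include <string>

// Replace every occurrence of `search` in `s` with `replace`
// (same contract as llama.cpp's string_replace_all helper).
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    for (size_t pos = 0; (pos = s.find(search, pos)) != std::string::npos; pos += replace.size()) {
        s.replace(pos, search.size(), replace);
    }
}

int main() {
    // Hypothetical rerank template, for illustration only.
    std::string prompt = "<query>{query}</query>\n<document>{document}</document>";
    replace_all(prompt, "{query}",    "what is panda?");
    replace_all(prompt, "{document}", "The giant panda is a bear endemic to China.");
    std::cout << prompt << '\n'; // the result is tokenized as a single sub-prompt
    return 0;
}

Note that the filled template is tokenized as one sub-prompt; the `false, true` argument pair mirrors tokenize_input_prompts' add_special/parse_special order, which suggests special tokens embedded in the template are parsed rather than treated as literal text.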