	Merge branch 'master' into gg/llama-kv-cache
ggml-ci
@@ -42,7 +42,7 @@ enum stop_type {
     STOP_TYPE_LIMIT,
 };
 
-// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
+// state diagram: https://github.com/ggml-org/llama.cpp/pull/9283
 enum slot_state {
     SLOT_STATE_IDLE,
     SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
@@ -173,6 +173,7 @@ struct slot_params {
             {"grammar_trigger_words",     grammar_trigger_words},
             {"grammar_trigger_tokens",    sampling.grammar_trigger_tokens},
             {"preserved_tokens",          sampling.preserved_tokens},
+            {"chat_format",               common_chat_format_name(oaicompat_chat_format)},
             {"samplers",                  samplers},
             {"speculative.n_max",         speculative.n_max},
             {"speculative.n_min",         speculative.n_min},
@@ -724,9 +725,19 @@ struct server_task_result_cmpl_final : server_task_result {
             msg.content = content;
         }
 
-        json tool_calls;
+        json message {
+            {"role", "assistant"},
+        };
+        if (!msg.reasoning_content.empty()) {
+            message["reasoning_content"] = msg.reasoning_content;
+        }
+        if (msg.content.empty() && !msg.tool_calls.empty()) {
+            message["content"] = json();
+        } else {
+            message["content"] = msg.content;
+        }
         if (!msg.tool_calls.empty()) {
-            tool_calls = json::array();
+            auto tool_calls = json::array();
             for (const auto & tc : msg.tool_calls) {
                 tool_calls.push_back({
                     {"type", "function"},
@@ -737,15 +748,7 @@ struct server_task_result_cmpl_final : server_task_result {
                     {"id", tc.id},
                 });
             }
-        }
-
-        json message {
-            {"content", msg.content},
-            {"tool_calls", tool_calls},
-            {"role", "assistant"},
-        };
-        if (!msg.tool_plan.empty()) {
-            message["tool_plan"] = msg.tool_plan;
+            message["tool_calls"] = tool_calls;
         }
 
         json choice {
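Review note: these two hunks invert the construction order — the `message` object is built first and fields are attached conditionally, which is what lets `content` become JSON `null` (rather than `""`) when the assistant answered purely with tool calls, matching OpenAI's response shape; the `tool_calls` array is now branch-local and only attached when non-empty. A minimal standalone sketch of the resulting flow, assuming nlohmann::json — the `Msg`/`ToolCall` types are illustrative stand-ins, not the server's real structs:

    #include <nlohmann/json.hpp>
    #include <string>
    #include <vector>
    using json = nlohmann::json;

    struct ToolCall { std::string name, arguments, id; };
    struct Msg {
        std::string content, reasoning_content;
        std::vector<ToolCall> tool_calls;
    };

    json build_message(const Msg & msg) {
        json message {
            {"role", "assistant"},
        };
        if (!msg.reasoning_content.empty()) {
            message["reasoning_content"] = msg.reasoning_content;
        }
        // OpenAI-style responses use content: null (not "") when the
        // assistant replied purely with tool calls
        if (msg.content.empty() && !msg.tool_calls.empty()) {
            message["content"] = json(); // JSON null
        } else {
            message["content"] = msg.content;
        }
        if (!msg.tool_calls.empty()) {
            auto tool_calls = json::array();
            for (const auto & tc : msg.tool_calls) {
                tool_calls.push_back({
                    {"type", "function"},
                    {"function", {{"name", tc.name}, {"arguments", tc.arguments}}},
                    {"id", tc.id},
                });
            }
            // attach the array only when there is something to report
            message["tool_calls"] = tool_calls;
        }
        return message;
    }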
@@ -1600,6 +1603,10 @@ struct server_queue {
 
             while (true) {
                 std::unique_lock<std::mutex> lock(mutex_tasks);
+                if (!running) {
+                    QUE_DBG("%s", "terminate\n");
+                    return;
+                }
                 if (queue_tasks.empty()) {
                     lock.unlock();
                     break;
@@ -1620,11 +1627,11 @@ struct server_queue {
             QUE_DBG("%s", "waiting for new tasks\n");
             {
                 std::unique_lock<std::mutex> lock(mutex_tasks);
+                if (!running) {
+                    QUE_DBG("%s", "terminate\n");
+                    return;
+                }
                 if (queue_tasks.empty()) {
-                    if (!running) {
-                        QUE_DBG("%s", "terminate\n");
-                        return;
-                    }
                     condition_tasks.wait(lock, [&]{
                         return (!queue_tasks.empty() || !running);
                     });
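Review note: both hunks hoist the `running` check so it happens under `mutex_tasks` before the queue-empty test, closing the window where `terminate()` fires between loop iterations but the loop only noticed it when the queue happened to be empty. A self-contained sketch of the pattern, with generic names standing in for the server's members:

    #include <condition_variable>
    #include <deque>
    #include <mutex>

    struct TaskQueue {
        std::mutex mtx;
        std::condition_variable cv;
        std::deque<int> tasks;
        bool running = true;

        void loop() {
            while (true) {
                std::unique_lock<std::mutex> lock(mtx);
                // check the stop flag under the lock *before* sleeping, so a
                // terminate() issued between iterations is never missed
                if (!running) return;
                cv.wait(lock, [&] { return !tasks.empty() || !running; });
                if (!running) return;
                int task = tasks.front();
                tasks.pop_front();
                lock.unlock();
                (void) task; // process the task here
            }
        }

        void terminate() {
            {
                std::lock_guard<std::mutex> lock(mtx);
                running = false; // flip the flag under the waiter's mutex
            }
            cv.notify_all();     // wake the loop so it can observe !running
        }
    };

The stop flag must be written under the same mutex the waiter holds; otherwise the notification can be lost between the predicate check and the actual wait.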
@@ -2069,8 +2076,8 @@ struct server_context {
 
         if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
             // Might be better to reject the request with a 400 ?
-            SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.params.n_predict, slot.n_predict);
             slot.params.n_predict = slot.n_predict;
+            SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.n_predict, slot.n_predict);
         }
 
         if (slot.params.ignore_eos && has_eos_token) {
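Review note: once the clamp has run, `slot.params.n_predict` equals `slot.n_predict`, so a warning emitted after the assignment can only report the enforced limit — hence both format arguments now pass `slot.n_predict`. A trivial runnable illustration with generic names:

    #include <cstdio>

    int main() {
        int server_limit = 128;   // plays the role of slot.n_predict
        int requested    = 4096;  // plays the role of slot.params.n_predict
        if (server_limit > 0 && requested > server_limit) {
            requested = server_limit; // clamp first...
            // ...so the warning can only report the enforced value
            fprintf(stderr, "n_predict = %d exceeds server configuration, setting to %d\n",
                    server_limit, server_limit);
        }
        printf("effective n_predict = %d\n", requested);
        return 0;
    }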
@@ -2275,7 +2282,7 @@ struct server_context {
             for (size_t i = 0; i < std::min(max_probs, n_probs); i++) {
                 result.probs.push_back({
                     cur_p->data[i].id,
-                    common_detokenize(ctx, {cur_p->data[i].id}, special),
+                    common_token_to_piece(ctx, cur_p->data[i].id, special),
                     cur_p->data[i].p
                 });
             }
@@ -2297,7 +2304,7 @@ struct server_context {
             for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) {
                 result.probs.push_back({
                     cur[i].id,
-                    common_detokenize(ctx, {cur[i].id}, special),
+                    common_token_to_piece(ctx, cur[i].id, special),
                     cur[i].p
                 });
             }
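Review note (my reading, not stated in the commit): `common_detokenize` renders a token *sequence* and may apply sequence-level normalization, while `common_token_to_piece` returns the exact vocab piece for a single token — the more faithful choice for per-candidate probability entries. A fragment sketching the difference; it assumes llama.cpp's common.h and an initialized `llama_context * ctx`, so it is not a standalone program:

    // both helpers appear in the diff above; only their granularity differs
    llama_token tok = cur_p->data[i].id;
    std::string as_sequence = common_detokenize(ctx, {tok}, special);   // sequence path
    std::string as_piece    = common_token_to_piece(ctx, tok, special); // single-token path
    // after this change, result.probs stores as_piece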
@@ -3649,7 +3656,7 @@ int main(int argc, char ** argv) {
             }, {
                     {"name",  "n_busy_slots_per_decode"},
                     {"help",  "Average number of busy slots per llama_decode() call"},
-                    {"value",  (float) res_metrics->n_busy_slots_total / (float) res_metrics->n_decode_total}
+                    {"value",  (float) res_metrics->n_busy_slots_total / std::max((float) res_metrics->n_decode_total, 1.f)}
             }}},
             {"gauge", {{
                     {"name",  "prompt_tokens_seconds"},
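Review note: if no `llama_decode()` call has happened yet, `n_decode_total` is 0 and the old expression evaluates to NaN (0.f/0.f), which is not a valid Prometheus sample value; clamping the denominator to 1 makes the metric read 0 instead. A runnable illustration:

    #include <algorithm>
    #include <cstdio>

    int main() {
        float n_busy_slots_total = 0.f;
        float n_decode_total     = 0.f; // server has not decoded anything yet
        float avg = n_busy_slots_total / std::max(n_decode_total, 1.f); // 0, not NaN
        printf("n_busy_slots_per_decode = %f\n", avg);
        return 0;
    }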
@@ -4056,7 +4063,7 @@ int main(int argc, char ** argv) {
         }
 
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
 
         return handle_completions_impl(
             SERVER_TASK_TYPE_COMPLETION,
@@ -4069,7 +4076,7 @@ int main(int argc, char ** argv) {
     // same with handle_chat_completions, but without inference part
     const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
         res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
     };
 
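Review note: both call sites gain a `params.reasoning_format` argument, so reasoning extraction is decided by server configuration at parse time; this pairs with the `reasoning_content` field added to the final message above. A rough sketch of the behavior such a parameter selects — the enum and function names here are illustrative, and the real splitting lives in the chat-template code, not in server.cpp:

    #include <string>

    enum reasoning_format_sketch { FORMAT_NONE, FORMAT_DEEPSEEK };

    struct parsed_msg { std::string content, reasoning_content; };

    // With DEEPSEEK-style formatting, "<think>...</think>" is split out of
    // content into reasoning_content; with NONE it is left in place.
    parsed_msg parse_reasoning(const std::string & text, reasoning_format_sketch fmt) {
        parsed_msg out;
        const std::string open = "<think>", close = "</think>";
        const auto b = text.find(open);
        const auto e = text.find(close);
        if (fmt == FORMAT_DEEPSEEK && b != std::string::npos && e != std::string::npos && b < e) {
            out.reasoning_content = text.substr(b + open.size(), e - b - open.size());
            out.content = text.substr(e + close.size());
        } else {
            out.content = text;
        }
        return out;
    }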
@@ -4430,6 +4437,7 @@ int main(int argc, char ** argv) {
 
     // clean up function, to be called before exit
     auto clean_up = [&svr]() {
+        SRV_INF("%s: cleaning up before exit...\n", __func__);
         svr->stop();
         llama_backend_free();
     };
@@ -4446,10 +4454,6 @@ int main(int argc, char ** argv) {
     }
 
     if (!was_bound) {
-        //LOG_ERROR("couldn't bind HTTP server socket", {
-        //    {"hostname", params.hostname},
-        //    {"port", params.port},
-        //});
         LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, params.hostname.c_str(), params.port);
         clean_up();
         return 1;
@@ -4466,7 +4470,7 @@ int main(int argc, char ** argv) {
 
     if (!ctx_server.load_model(params)) {
         clean_up();
-        t.join();
+        // t.join(); // FIXME: see below
         LOG_ERR("%s: exiting due to model loading error\n", __func__);
         return 1;
     }
@@ -4490,13 +4494,10 @@ int main(int argc, char ** argv) {
     });
 
     shutdown_handler = [&](int) {
+        // this will unblock start_loop()
         ctx_server.queue_tasks.terminate();
     };
 
-    LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
-
-    ctx_server.queue_tasks.start_loop();
-
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
     struct sigaction sigint_action;
     sigint_action.sa_handler = signal_handler;
@@ -4511,8 +4512,13 @@ int main(int argc, char ** argv) {
     SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
 
+    LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
+
+    // this call blocks the main thread until queue_tasks.terminate() is called
+    ctx_server.queue_tasks.start_loop();
+
     clean_up();
-    t.join();
+    // t.join(); // FIXME: http thread may stuck if there is an on-going request. we don't need to care about this for now as the HTTP connection will already be closed at this point, but it's better to fix this
 
     return 0;
 }
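Review note: the last three hunks reorder startup and shutdown — signal handlers are now installed before the blocking `start_loop()`, the listening log moves next to the loop, and `clean_up()` runs after the loop returns, before the (currently disabled) `t.join()`. A minimal runnable model of the resulting control flow, with an atomic flag standing in for the real task queue:

    #include <atomic>
    #include <chrono>
    #include <csignal>
    #include <cstdio>
    #include <thread>

    static std::atomic<bool> running{true};

    static void signal_handler(int) {
        running = false; // plays the role of queue_tasks.terminate()
    }

    int main() {
        // stand-in for the detached HTTP listener thread `t`
        std::thread http_thread([] {
            while (running) std::this_thread::sleep_for(std::chrono::milliseconds(50));
        });

        std::signal(SIGINT, signal_handler); // install *before* blocking, as in the patch

        // stand-in for ctx_server.queue_tasks.start_loop(): blocks the main thread
        while (running) std::this_thread::sleep_for(std::chrono::milliseconds(50));

        puts("cleaning up before exit..."); // clean_up(): stop server, free backend
        http_thread.join();                 // joined last; see the FIXME in the diff
        return 0;
    }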