mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-11-04 09:32:00 +00:00)
	allow server to multithread
Because web browsers send a lot of garbage requests (e.g. 404s for favicons), we want the server to handle them on multiple threads. To avoid blowing up llama, we just take a mutex whenever llama is invoked.
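As a minimal sketch of the idea (standard library only, no cpp-httplib; the names SharedContext and handle_request are made up for illustration, they do not appear in the diff): each handler takes an RAII lock on the one shared context, so the HTTP layer can run many threads while inference itself stays serialized.

#include <functional>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>

// Illustrative stand-in for llama_server_context: one shared resource whose
// use must be serialized even though HTTP handlers may run on many threads.
struct SharedContext {
    std::mutex mutex;
    int requests_served = 0;

    // Same shape as the lock() helper added in the diff: return an RAII lock
    // by value so the caller holds the mutex for the whole request.
    std::unique_lock<std::mutex> lock() {
        return std::unique_lock<std::mutex>(mutex);
    }
};

// Stand-in for a handler body such as the /completion lambda.
void handle_request(SharedContext & ctx, int id) {
    auto lock = ctx.lock();   // only one "inference" runs at a time
    ++ctx.requests_served;    // safe: protected by the mutex
    std::cout << "request " << id << " served on thread "
              << std::this_thread::get_id() << "\n";
}                             // mutex released when lock goes out of scope

int main() {
    SharedContext ctx;
    std::vector<std::thread> pool;   // stands in for the HTTP thread pool
    for (int i = 0; i < 4; ++i) {
        pool.emplace_back(handle_request, std::ref(ctx), i);
    }
    for (auto & t : pool) {
        t.join();
    }
    std::cout << "served " << ctx.requests_served << " requests\n";
}

Returning the std::unique_lock by value works because unique_lock is movable, so `auto lock = llama.lock();` in each handler keeps the mutex held until the handler returns.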
@@ -2,8 +2,6 @@
#include "llama.h"
#include "build-info.h"

// single thread
#define CPPHTTPLIB_THREAD_POOL_COUNT 1
#ifndef NDEBUG
// crash the server in debug mode, otherwise send an http 500 error
#define CPPHTTPLIB_NO_EXCEPTIONS 1
@@ -34,7 +32,6 @@ struct server_params {
    int32_t port = 8080;
    int32_t read_timeout = 600;
    int32_t write_timeout = 600;

};

// completion token output with probabilities
@@ -183,6 +180,12 @@ struct llama_server_context {
    std::string stopping_word;
    int32_t multibyte_pending = 0;

    std::mutex mutex;

    std::unique_lock<std::mutex> lock() {
        return std::unique_lock<std::mutex>(mutex);
    }

    ~llama_server_context() {
        if (ctx) {
            llama_free(ctx);
@@ -912,6 +915,7 @@ int main(int argc, char ** argv) {
    Server svr;

    svr.set_default_headers({
        { "Server", "llama.cpp" },
        { "Access-Control-Allow-Origin", "*" },
        { "Access-Control-Allow-Headers", "content-type" }
    });
@@ -929,7 +933,10 @@ int main(int argc, char ** argv) {
    });

    svr.Post("/completion", [&llama](const Request & req, Response & res) {
        auto lock = llama.lock();

        llama.rewind();

        llama_reset_timings(llama.ctx);

        parse_options_completion(json::parse(req.body), llama);
@@ -1038,6 +1045,8 @@ int main(int argc, char ** argv) {
    });

    svr.Post("/tokenize", [&llama](const Request & req, Response & res) {
        auto lock = llama.lock();

        const json body = json::parse(req.body);
        const std::string content = body.value("content", "");
        const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
@@ -1046,6 +1055,8 @@ int main(int argc, char ** argv) {
    });

    svr.Post("/embedding", [&llama](const Request & req, Response & res) {
        auto lock = llama.lock();

        const json body = json::parse(req.body);

        llama.rewind();