mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	server : enhanced health endpoint (#5548)
* server: enrich health endpoint with available slots, return 503 if no slots are available * server: document the new "no slot available" status in the README.md
This commit is contained in:
		| @@ -136,6 +136,7 @@ node index.js | |||||||
|   - `{"status": "loading model"}` if the model is still being loaded. |   - `{"status": "loading model"}` if the model is still being loaded. | ||||||
|   - `{"status": "error"}` if the model failed to load. |   - `{"status": "error"}` if the model failed to load. | ||||||
|   - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below. |   - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below. | ||||||
|  |   - `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slots are currently available | ||||||
|  |  | ||||||
| - **POST** `/completion`: Given a `prompt`, it returns the predicted completion. | - **POST** `/completion`: Given a `prompt`, it returns the predicted completion. | ||||||
|  |  | ||||||
|   | |||||||
| @@ -2578,8 +2578,35 @@ int main(int argc, char **argv) | |||||||
|         server_state current_state = state.load(); |         server_state current_state = state.load(); | ||||||
|         switch(current_state) { |         switch(current_state) { | ||||||
|             case SERVER_STATE_READY: |             case SERVER_STATE_READY: | ||||||
|                 res.set_content(R"({"status": "ok"})", "application/json"); |                 if (llama.all_slots_are_idle) { | ||||||
|                 res.status = 200; // HTTP OK |                     res.set_content(R"({"status": "ok"})", "application/json"); | ||||||
|  |                     res.status = 200; // HTTP OK | ||||||
|  |                 } else { | ||||||
|  |                     int available_slots = 0; | ||||||
|  |                     int processing_slots = 0; | ||||||
|  |                     for (llama_client_slot & slot : llama.slots) { | ||||||
|  |                         if (slot.available()) { | ||||||
|  |                             available_slots++; | ||||||
|  |                         } else { | ||||||
|  |                             processing_slots++; | ||||||
|  |                         } | ||||||
|  |                     } | ||||||
|  |                     if (available_slots > 0) { | ||||||
|  |                         json health = { | ||||||
|  |                                 {"status",           "ok"}, | ||||||
|  |                                 {"slots_idle",       available_slots}, | ||||||
|  |                                 {"slots_processing", processing_slots}}; | ||||||
|  |                         res.set_content(health.dump(), "application/json"); | ||||||
|  |                         res.status = 200; // HTTP OK | ||||||
|  |                     } else { | ||||||
|  |                         json health = { | ||||||
|  |                                 {"status",           "no slot available"}, | ||||||
|  |                                 {"slots_idle",       available_slots}, | ||||||
|  |                                 {"slots_processing", processing_slots}}; | ||||||
|  |                         res.set_content(health.dump(), "application/json"); | ||||||
|  |                         res.status = 503; // HTTP Service Unavailable | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|                 break; |                 break; | ||||||
|             case SERVER_STATE_LOADING_MODEL: |             case SERVER_STATE_LOADING_MODEL: | ||||||
|                 res.set_content(R"({"status": "loading model"})", "application/json"); |                 res.set_content(R"({"status": "loading model"})", "application/json"); | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user