server : simplify state machine for slot (#9283)
* server : simplify state machine for slot
* add SLOT_STATE_DONE_PROMPT
* pop_deferred_task
* add missing notify_one
* fix passkey test
* metrics : add n_busy_slots_per_decode
* fix test step
* add test
* maybe fix AddressSanitizer?
* fix deque ?
* missing lock
* pop_deferred_task: also notify
* Update examples/server/server.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
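In short: the separate slot_command enum is folded into a single slot_state machine, slots now release themselves inline (promoting one deferred task per freed slot), and a new busy-slots-per-decode metric is exported. A minimal sketch of the resulting slot lifecycle, using the enum from the patch below; the transition comments are editorial summaries of the diff, not code from the repository:

    // sketch: simplified slot lifecycle (state diagram linked from PR #9283)
    enum slot_state {
        SLOT_STATE_IDLE,              // free; assigning a task moves it to PROCESSING_PROMPT
        SLOT_STATE_PROCESSING_PROMPT, // prompt tokens are decoded batch by batch
        SLOT_STATE_DONE_PROMPT,       // prompt fully evaluated; embedding slots release here,
                                      // completion slots advance to GENERATING
        SLOT_STATE_GENERATING,        // sampling tokens; release() returns the slot to IDLE
    };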
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -50,15 +50,12 @@ enum stop_type {
     STOP_TYPE_PARTIAL,
 };
 
+// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
 enum slot_state {
     SLOT_STATE_IDLE,
-    SLOT_STATE_PROCESSING,
-};
-
-enum slot_command {
-    SLOT_COMMAND_NONE,
-    SLOT_COMMAND_LOAD_PROMPT,
-    SLOT_COMMAND_RELEASE,
+    SLOT_STATE_PROCESSING_PROMPT,
+    SLOT_STATE_DONE_PROMPT,
+    SLOT_STATE_GENERATING,
 };
 
 enum server_state {
@@ -135,7 +132,6 @@ struct server_slot {
     struct slot_params params;
 
     slot_state state = SLOT_STATE_IDLE;
-    slot_command command = SLOT_COMMAND_NONE;
 
     // used to determine the slot that has been used the longest
     int64_t t_last_used = -1;
@@ -194,6 +190,8 @@ struct server_slot {
     double t_prompt_processing; // ms
     double t_token_generation; // ms
 
+    std::function<void(int)> callback_on_release;
+
     void reset() {
         n_prompt_tokens    = 0;
         generated_text     = "";
@@ -228,25 +226,28 @@ struct server_slot {
         return n_remaining > 0; // no budget
     }
 
-    bool available() const {
-        return state == SLOT_STATE_IDLE && command == SLOT_COMMAND_NONE;
-    }
-
     bool is_processing() const {
-        return (state == SLOT_STATE_IDLE && command == SLOT_COMMAND_LOAD_PROMPT) || state == SLOT_STATE_PROCESSING;
+        return state != SLOT_STATE_IDLE;
     }
 
     void add_token_string(const completion_token_output & token) {
-        if (command == SLOT_COMMAND_RELEASE) {
+        if (!is_processing()) {
             return;
         }
         generated_token_probs.push_back(token);
     }
 
     void release() {
-        if (state == SLOT_STATE_PROCESSING) {
+        if (is_processing()) {
             t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
-            command = SLOT_COMMAND_RELEASE;
+            state = SLOT_STATE_IDLE;
+            LOG_INFO("slot released", {
+                {"id_slot",   id},
+                {"id_task",   id_task},
+                {"n_past",    n_past},
+                {"truncated", truncated},
+            });
+            callback_on_release(id);
         }
     }
 
@@ -353,6 +354,9 @@ struct server_metrics {
     uint64_t n_tokens_predicted  = 0;
     uint64_t t_tokens_generation = 0;
 
+    uint64_t n_decode_total     = 0;
+    uint64_t n_busy_slots_total = 0;
+
     void init() {
         t_start = ggml_time_us();
     }
@@ -371,6 +375,15 @@ struct server_metrics {
         t_tokens_generation_total  += slot.t_token_generation;
     }
 
+    void on_decoded(const std::vector<server_slot> & slots) {
+        n_decode_total++;
+        for (const auto & slot : slots) {
+            if (slot.is_processing()) {
+                n_busy_slots_total++;
+            }
+        }
+    }
+
     void reset_bucket() {
         n_prompt_tokens_processed = 0;
         t_prompt_processing       = 0;
@@ -432,6 +445,7 @@ struct server_queue {
     void defer(server_task task) {
         std::unique_lock<std::mutex> lock(mutex_tasks);
         queue_tasks_deferred.push_back(std::move(task));
+        condition_tasks.notify_one();
     }
 
     // Get the next id for creating a new task
@@ -452,14 +466,14 @@ struct server_queue {
         callback_update_slots = std::move(callback);
    }
 
-    // Call when the state of one slot is changed
-    void notify_slot_changed() {
-        // move deferred tasks back to main loop
+    // Call when the state of one slot is changed, it will move one task from deferred to main queue
+    void pop_deferred_task() {
         std::unique_lock<std::mutex> lock(mutex_tasks);
-        for (auto & task : queue_tasks_deferred) {
-            queue_tasks.push_back(std::move(task));
-        }
-        queue_tasks_deferred.clear();
+        if (!queue_tasks_deferred.empty()) {
+            queue_tasks.emplace_back(std::move(queue_tasks_deferred.front()));
+            queue_tasks_deferred.pop_front();
+        }
+        condition_tasks.notify_one();
     }
 
     // end the start_loop routine
@@ -489,7 +503,7 @@ struct server_queue {
                     break;
                 }
                 server_task task = queue_tasks.front();
-                queue_tasks.erase(queue_tasks.begin());
+                queue_tasks.pop_front();
                 lock.unlock();
                 LOG_VERBOSE("callback_new_task", {{"id_task", task.id}});
                 callback_new_task(task);
@@ -717,6 +731,10 @@ struct server_context {
 
             slot.sparams = params.sparams;
 
+            slot.callback_on_release = [this](int) {
+                queue_tasks.pop_deferred_task();
+            };
+
             slot.reset();
 
             slots.push_back(slot);
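The lambda wired above is what replaces the old release pass in the update loop: where notify_slot_changed() used to flush every deferred task back to the main queue whenever any slot changed, releasing a slot now promotes exactly one deferred task. A condensed sketch of the handoff, assembled from the hunks above (names as in the patch; class scaffolding elided):

    // slot.release() -> callback_on_release(id) -> queue_tasks.pop_deferred_task()
    void pop_deferred_task() {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        if (!queue_tasks_deferred.empty()) {
            // promote exactly one deferred task to the main queue
            queue_tasks.emplace_back(std::move(queue_tasks_deferred.front()));
            queue_tasks_deferred.pop_front();
        }
        condition_tasks.notify_one(); // wake the task loop even if nothing was moved
    }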
@@ -798,7 +816,7 @@ struct server_context {
 
             for (server_slot & slot : slots) {
                 // skip the slot if it is not available
-                if (!slot.available()) {
+                if (slot.is_processing()) {
                     continue;
                 }
 
@@ -840,7 +858,7 @@ struct server_context {
             int64_t t_last = ggml_time_us();
             for (server_slot & slot : slots) {
                 // skip the slot if it is not available
-                if (!slot.available()) {
+                if (slot.is_processing()) {
                     continue;
                 }
 
@@ -1078,7 +1096,7 @@ struct server_context {
             }
         }
 
-        slot.command = SLOT_COMMAND_LOAD_PROMPT;
+        slot.state = SLOT_STATE_PROCESSING_PROMPT;
         slot.prompt_tokens.clear();
 
         LOG_INFO("slot is processing task", {
@@ -1622,7 +1640,7 @@ struct server_context {
                         queue_tasks.defer(task);
                         break;
                     }
-                    if (!slot->available()) {
+                    if (slot->is_processing()) {
                         // if requested slot is unavailable, we defer this task for processing later
                         LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
                         queue_tasks.defer(task);
@@ -1728,6 +1746,9 @@ struct server_context {
                         { "n_tokens_predicted",              metrics.n_tokens_predicted},
                         { "t_tokens_generation",             metrics.t_tokens_generation},
 
+                        { "n_decode_total",                  metrics.n_decode_total},
+                        { "n_busy_slots_total",              metrics.n_busy_slots_total},
+
                         { "kv_cache_tokens_count",           llama_get_kv_cache_token_count(ctx)},
                         { "kv_cache_used_cells",             llama_get_kv_cache_used_cells(ctx)},
 
@@ -1747,7 +1768,7 @@ struct server_context {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
                         break;
                     }
-                    if (!slot->available()) {
+                    if (slot->is_processing()) {
                         // if requested slot is unavailable, we defer this task for processing later
                         LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
                         queue_tasks.defer(task);
@@ -1788,7 +1809,7 @@ struct server_context {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
                         break;
                     }
-                    if (!slot->available()) {
+                    if (slot->is_processing()) {
                         // if requested slot is unavailable, we defer this task for processing later
                         LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
                         queue_tasks.defer(task);
@@ -1836,7 +1857,7 @@ struct server_context {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
                         break;
                     }
-                    if (!slot->available()) {
+                    if (slot->is_processing()) {
                         // if requested slot is unavailable, we defer this task for processing later
                         LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
                         queue_tasks.defer(task);
@@ -1876,33 +1897,12 @@ struct server_context {
             system_prompt_update();
         }
 
-        // release slots
-        for (auto & slot : slots) {
-            if (slot.command == SLOT_COMMAND_RELEASE) {
-                slot.state       = SLOT_STATE_IDLE;
-                slot.command     = SLOT_COMMAND_NONE;
-                slot.t_last_used = ggml_time_us();
-
-                LOG_INFO("slot released", {
-                    {"id_slot",         slot.id},
-                    {"id_task",         slot.id_task},
-                    {"n_ctx",           n_ctx},
-                    {"n_past",          slot.n_past},
-                    {"n_system_tokens", system_tokens.size()},
-                    {"n_cache_tokens",  slot.cache_tokens.size()},
-                    {"truncated",       slot.truncated}
-                });
-
-                queue_tasks.notify_slot_changed();
-            }
-        }
-
         // check if all slots are idle
         {
             bool all_idle = true;
 
             for (auto & slot : slots) {
-                if (slot.state != SLOT_STATE_IDLE || slot.command != SLOT_COMMAND_NONE) {
+                if (slot.is_processing()) {
                     all_idle = false;
                     break;
                 }
@@ -1973,7 +1973,7 @@ struct server_context {
 
         // frist, add sampled tokens from any ongoing sequences
         for (auto & slot : slots) {
-            if (slot.state == SLOT_STATE_IDLE) {
+            if (slot.state != SLOT_STATE_GENERATING) {
                 continue;
             }
 
@@ -2015,7 +2015,7 @@ struct server_context {
         if (params.cont_batching || batch.n_tokens == 0) {
             for (auto & slot : slots) {
                 // this slot still has a prompt to be processed
-                if (slot.state == SLOT_STATE_IDLE && slot.command == SLOT_COMMAND_LOAD_PROMPT) {
+                if (slot.state == SLOT_STATE_PROCESSING_PROMPT) {
                     auto & prompt_tokens = slot.prompt_tokens;
 
                     // we haven't tokenized the prompt yet - do it now:
@@ -2083,8 +2083,6 @@ struct server_context {
                                 {"id_task", slot.id_task}
                             });
 
-                            slot.state = SLOT_STATE_PROCESSING;
-                            slot.command = SLOT_COMMAND_NONE;
                             slot.release();
                             slot.print_timings();
                             send_final_response(slot);
@@ -2094,8 +2092,6 @@ struct server_context {
                         if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING) {
                             // this prompt is too large to process - discard it
                             if (slot.n_prompt_tokens > n_ubatch) {
-                                slot.state = SLOT_STATE_PROCESSING;
-                                slot.command = SLOT_COMMAND_NONE;
                                 slot.release();
                                 send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
                                 continue;
@@ -2253,10 +2249,9 @@ struct server_context {
                         {"progress", (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens},
                     });
 
-                    // entire prompt has been processed - start decoding new tokens
+                    // entire prompt has been processed
                     if (slot.n_past == slot.n_prompt_tokens) {
-                        slot.state   = SLOT_STATE_PROCESSING;
-                        slot.command = SLOT_COMMAND_NONE;
+                        slot.state = SLOT_STATE_DONE_PROMPT;
 
                         GGML_ASSERT(batch.n_tokens > 0);
 
@@ -2338,18 +2333,17 @@ struct server_context {
             };
 
             const int ret = llama_decode(ctx, batch_view);
+            metrics.on_decoded(slots);
 
             if (ret != 0) {
                 if (n_batch == 1 || ret < 0) {
                     // if you get here, it means the KV cache is full - try increasing it via the context size
                     LOG_ERROR("failed to decode the batch: KV cache is full - try increasing it via the context size", {
-                        {"i",   i},
-                        {"n_batch",  ret},
-                        {"ret",   ret},
+                        {"i",       i},
+                        {"n_batch", n_batch},
+                        {"ret",     ret},
                     });
                     for (auto & slot : slots) {
-                        slot.state = SLOT_STATE_PROCESSING;
-                        slot.command = SLOT_COMMAND_NONE;
                         slot.release();
                         send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size.");
                     }
@@ -2361,24 +2355,31 @@ struct server_context {
                 i -= n_batch;
 
                 LOG_WARNING("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation", {
-                    {"i",   i},
-                    {"n_batch",  n_batch},
-                    {"ret",   ret},
+                    {"i",       i},
+                    {"n_batch", n_batch},
+                    {"ret",     ret},
                 });
 
                 continue; // continue loop of n_batch
             }
 
             for (auto & slot : slots) {
-                if (slot.state != SLOT_STATE_PROCESSING || slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) {
+                if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) {
                     continue; // continue loop of slots
                 }
 
-                // prompt evaluated for embedding
-                if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING) {
-                    send_embedding(slot, batch_view);
-                    slot.release();
-                    slot.i_batch = -1;
+                if (slot.state == SLOT_STATE_DONE_PROMPT) {
+                    if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING) {
+                        // prompt evaluated for embedding
+                        send_embedding(slot, batch_view);
+                        slot.release();
+                        slot.i_batch = -1;
+                        continue; // continue loop of slots
+                    } else {
+                        // prompt evaluated for next-token prediction
+                        slot.state = SLOT_STATE_GENERATING;
+                    }
+                } else if (slot.state != SLOT_STATE_GENERATING) {
                     continue; // continue loop of slots
                 }
 
@@ -2425,6 +2426,7 @@ struct server_context {
                 }
 
                 if (!process_token(result, slot)) {
+                    // release slot because of stop condition
                     slot.release();
                     slot.print_timings();
                     send_final_response(slot);
@@ -2705,7 +2707,7 @@ int main(int argc, char ** argv) {
         task.type = SERVER_TASK_TYPE_METRICS;
 
         ctx_server.queue_results.add_waiting_task_id(task.id);
-        ctx_server.queue_tasks.post(task);
+        ctx_server.queue_tasks.post(task, true); // high-priority task
 
         // get the result
         server_task_result result = ctx_server.queue_results.recv(task.id);
@@ -2737,7 +2739,7 @@ int main(int argc, char ** argv) {
         task.data.push_back({{"reset_bucket", true}});
 
         ctx_server.queue_results.add_waiting_task_id(task.id);
-        ctx_server.queue_tasks.post(task);
+        ctx_server.queue_tasks.post(task, true); // high-priority task
 
         // get the result
         server_task_result result = ctx_server.queue_results.recv(task.id);
@@ -2751,6 +2753,9 @@ int main(int argc, char ** argv) {
         const uint64_t n_tokens_predicted  = data.at("n_tokens_predicted");
         const uint64_t t_tokens_generation = data.at("t_tokens_generation");
 
+        const uint64_t n_decode_total     = data.at("n_decode_total");
+        const uint64_t n_busy_slots_total = data.at("n_busy_slots_total");
+
         const int32_t kv_cache_used_cells = data.at("kv_cache_used_cells");
 
         // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
@@ -2771,6 +2776,14 @@ int main(int argc, char ** argv) {
                     {"name",  "tokens_predicted_seconds_total"},
                     {"help",  "Predict process time"},
                     {"value",  (uint64_t) data.at("t_tokens_generation_total") / 1.e3}
+            }, {
+                    {"name",  "n_decode_total"},
+                    {"help",  "Total number of llama_decode() calls"},
+                    {"value",  n_decode_total}
+            }, {
+                    {"name",  "n_busy_slots_per_decode"},
+                    {"help",  "Average number of busy slots per llama_decode() call"},
+                    {"value",  (float) n_busy_slots_total / (float) n_decode_total}
             }}},
             {"gauge", {{
                     {"name",  "prompt_tokens_seconds"},
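The counter added above makes the exported value a lifetime average: n_busy_slots_per_decode = n_busy_slots_total / n_decode_total, since on_decoded() increments n_decode_total once per llama_decode() call and n_busy_slots_total once per slot found busy at that call. A hypothetical worked example (the numbers are illustrative, not from the patch):

    // suppose four llama_decode() calls observed 2, 3, 3, and 2 busy slots
    // n_decode_total          = 4
    // n_busy_slots_total      = 2 + 3 + 3 + 2 = 10
    // n_busy_slots_per_decode = (float) 10 / (float) 4 = 2.5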
@@ -2837,7 +2850,7 @@ int main(int argc, char ** argv) {
         task.data = {
             { "id_slot", id_slot },
             { "filename", filename },
-            { "filepath", filepath }
+            { "filepath", filepath },
         };
 
         const int id_task = ctx_server.queue_tasks.post(task);
@@ -2867,7 +2880,7 @@ int main(int argc, char ** argv) {
         task.data = {
             { "id_slot", id_slot },
             { "filename", filename },
-            { "filepath", filepath }
+            { "filepath", filepath },
         };
 
         const int id_task = ctx_server.queue_tasks.post(task);
@@ -2945,7 +2958,7 @@ int main(int argc, char ** argv) {
             { "system_prompt",               ctx_server.system_prompt.c_str() },
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
             { "total_slots",                 ctx_server.params.n_parallel },
-            { "chat_template",               curr_tmpl.c_str() }
+            { "chat_template",               curr_tmpl.c_str() },
         };
 
         res_ok(res, data);
@@ -3056,13 +3069,13 @@ int main(int argc, char ** argv) {
         json models = {
             {"object", "list"},
             {"data", {
-                 {
-                     {"id",       params.model_alias},
-                     {"object",   "model"},
-                     {"created",  std::time(0)},
-                     {"owned_by", "llamacpp"},
-                     {"meta",     ctx_server.model_meta()}
-                 },
+                {
+                    {"id",       params.model_alias},
+                    {"object",   "model"},
+                    {"created",  std::time(0)},
+                    {"owned_by", "llamacpp"},
+                    {"meta",     ctx_server.model_meta()}
+                },
             }}
         };
--- a/examples/server/tests/features/parallel.feature
+++ b/examples/server/tests/features/parallel.feature
@@ -77,6 +77,35 @@ Feature: Parallel
       | disabled  | 128       |
       | enabled   | 64        |
 
+  Scenario Outline: Multi users with number of prompts exceeding number of slots
+    Given a system prompt You are a writer.
+    And   a model tinyllama-2
+    Given a prompt:
+      """
+      Write a very long book.
+      """
+    And a prompt:
+      """
+      Write another a poem.
+      """
+    And a prompt:
+      """
+      What is LLM?
+      """
+    And a prompt:
+      """
+      The sky is blue and I love it.
+      """
+    And <n_predict> max tokens to predict
+    And streaming is <streaming>
+    Given concurrent OAI completions requests
+    Then the server is busy
+    Then the server is idle
+    Then all prompts are predicted with <n_predict> tokens
+    Examples:
+      | streaming | n_predict |
+      | disabled  | 128       |
+      | enabled   | 64        |
+
   Scenario:  Multi users with total number of tokens to predict exceeds the KV Cache size #3969
     Given a prompt:
--- a/examples/server/tests/features/passkey.feature
+++ b/examples/server/tests/features/passkey.feature
@@ -15,6 +15,7 @@ Feature: Passkey / Self-extend with context shift
     And   <n_junk> as number of junk
     And   <n_predicted> server max tokens to predict
     And   42 as seed
+    And   0.0 temperature
     And   <n_ctx> KV cache size
     And   1 slots
     And   <n_ga> group attention factor to extend context size through self-extend
@@ -22,7 +23,8 @@ Feature: Passkey / Self-extend with context shift
     # Can be override with N_GPU_LAYERS
     And   <ngl> GPU offloaded layers
     Then  the server is starting
-    Then  the server is healthy
+    # Higher timeout because the model may need to be downloaded from the internet
+    Then  the server is healthy with timeout 120 seconds
     Given available models
     Then  model 0 is trained on <n_ctx_train> tokens context
     Given a prefix prompt:
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -202,17 +202,15 @@ def step_start_server(context):
             time.sleep(0.1)
 
 
-@step("the server is {expecting_status}")
-@async_run_until_complete
-async def step_wait_for_the_server_to_be_started(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
+async def wait_for_server_status_with_timeout(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str, timeout: int):
     match expecting_status:
         case 'healthy':
             await wait_for_slots_status(context, context.base_url, 200,
-                                        timeout=30)
+                                        timeout=timeout)
 
         case 'ready' | 'idle':
             await wait_for_slots_status(context, context.base_url, 200,
-                                        timeout=30,
+                                        timeout=timeout,
                                         params={'fail_on_no_slot': 1},
                                         slots_idle=context.n_slots,
                                         slots_processing=0)
@@ -225,6 +223,18 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status: Lite
             assert False, "unknown status"
 
 
+@step("the server is {expecting_status} with timeout {timeout:d} seconds")
+@async_run_until_complete
+async def step_wait_for_server_status_with_timeout(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str, timeout: int):
+    await wait_for_server_status_with_timeout(context, expecting_status, timeout)
+
+
+@step("the server is {expecting_status}")
+@async_run_until_complete
+async def step_wait_for_server_status(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
+    await wait_for_server_status_with_timeout(context, expecting_status, 30)
+
+
 @step('all slots are {expected_slot_status_string}')
 @async_run_until_complete
 async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):