Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)
	Server: normalize naming (#5779)
* server: normalize naming
* fix spacing
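The change is mechanical: identifier renames, comment updates, and spacing fixes. Before the diff itself (which appears to cover the server example's server.cpp and its utils header), here is a small, self-contained C++ sketch of the new naming scheme. The struct bodies are trimmed to the fields visible in the hunks below and the old identifiers are noted in comments, so treat it as an illustration of the renames rather than the actual declarations.

// Sketch of the renamed server types after this commit (fields trimmed,
// old names in trailing comments); illustrative only.
#include <cstddef>
#include <cstdint>

enum slot_state   { IDLE, PROCESSING };           // unchanged, relocated in this commit
enum slot_command { NONE, LOAD_PROMPT, RELEASE }; // unchanged, relocated in this commit

struct server_slot {                        // was: llama_client_slot
    int id;
    int task_id = -1;

    int32_t n_prompt_tokens           = 0;  // was: num_prompt_tokens
    int32_t n_prompt_tokens_processed = 0;  // was: num_prompt_tokens_processed

    size_t n_sent_text        = 0;          // was: sent_count
    size_t n_sent_token_probs = 0;          // was: sent_token_probs_index
};

struct server_metrics {                     // was: llama_metrics
    uint64_t n_prompt_tokens_processed_total = 0;
    uint64_t n_tokens_predicted_total        = 0;
};

// Method renames in llama_server_context (same signatures):
//   update_system_prompt()         -> system_prompt_update()
//   notify_system_prompt_changed() -> system_prompt_notify()
//   process_system_prompt_data()   -> system_prompt_process()
// Queue callback rename in llama_server_queue:
//   on_all_tasks_finished() / callback_all_task_finished -> on_run_slots() / callback_run_slots

Note that the run_on_all_tasks_finished() wrapper is dropped: main() now binds llama_server_context::update_slots() directly to the queue's on_run_slots callback, as the last hunks of the first file show.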
@@ -33,8 +33,7 @@
 
 using json = nlohmann::json;
 
-struct server_params
-{
+struct server_params {
     std::string hostname = "127.0.0.1";
     std::vector<std::string> api_keys;
     std::string public_path = "examples/server/public";
@@ -49,103 +48,50 @@ struct server_params
 bool server_verbose = false;
 bool server_log_json = true;
 
-static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
-{
-    size_t i;
-    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
-    {
-    }
-    return i;
-}
-
-enum stop_type
-{
+enum stop_type {
     STOP_FULL,
     STOP_PARTIAL,
 };
 
-static bool ends_with(const std::string &str, const std::string &suffix)
-{
-    return str.size() >= suffix.size() &&
-           0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
-}
+// TODO: can become bool if we can't find use of more states
+enum slot_state {
+    IDLE,
+    PROCESSING,
+};
 
-static size_t find_partial_stop_string(const std::string &stop,
-                                       const std::string &text)
-{
-    if (!text.empty() && !stop.empty())
-    {
-        const char text_last_char = text.back();
-        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
-        {
-            if (stop[char_index] == text_last_char)
-            {
-                const std::string current_partial = stop.substr(0, char_index + 1);
-                if (ends_with(text, current_partial))
-                {
-                    return text.size() - char_index - 1;
-                }
-            }
-        }
-    }
-    return std::string::npos;
-}
+enum slot_command {
+    NONE,
+    LOAD_PROMPT,
+    RELEASE,
+};
 
-// TODO: reuse llama_detokenize
-template <class Iter>
-static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
-{
-    std::string ret;
-    for (; begin != end; ++begin)
-    {
-        ret += llama_token_to_piece(ctx, *begin);
-    }
-    return ret;
-}
+struct slot_params {
+    bool stream       = true;
+    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
 
-// format incomplete utf-8 multibyte character for output
-static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
-{
-    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
-    // if the size is 1 and first bit is 1, meaning it's a partial character
-    //   (size > 1 meaning it's already a known token)
-    if (out.size() == 1 && (out[0] & 0x80) == 0x80)
-    {
-        std::stringstream ss;
-        ss << std::hex << (out[0] & 0xff);
-        std::string res(ss.str());
-        out = "byte: \\x" + res;
-    }
-    return out;
-}
+    uint32_t seed      = -1; // RNG seed
+    int32_t  n_keep    =  0; // number of tokens to keep from initial prompt
+    int32_t  n_predict = -1; // new tokens to predict
 
-// convert a vector of completion_token_output to json
-static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs)
-{
-    json out = json::array();
-    for (const auto &prob : probs)
-    {
-        json probs_for_token = json::array();
-        for (const auto &p : prob.probs)
-        {
-            std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
-            probs_for_token.push_back(json
-            {
-                {"tok_str", tok_str},
-                {"prob",    p.prob},
-            });
-        }
-        std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
-        out.push_back(json{
-            {"content", tok_str},
-            {"probs",   probs_for_token},
-        });
-    }
-    return out;
-}
+    std::vector<std::string> antiprompt;
 
-struct llama_client_slot
-{
+    json input_prefix;
+    json input_suffix;
+};
+
+struct slot_image {
+    int32_t id;
+
+    bool request_encode_image = false;
+    float * image_embedding = nullptr;
+    int32_t image_tokens = 0;
+
+    clip_image_u8 * img_data;
+
+    std::string prefix_prompt; // before of this image
+};
+
+struct server_slot {
     int id;
     int task_id = -1;
 
@@ -165,8 +111,8 @@ struct llama_client_slot
     int32_t i_batch     = -1;
     int32_t n_predict   = -1;
 
-    int32_t num_prompt_tokens           = 0;
-    int32_t num_prompt_tokens_processed = 0;
+    int32_t n_prompt_tokens           = 0;
+    int32_t n_prompt_tokens_processed = 0;
 
     json prompt;
     std::string generated_text;
@@ -201,8 +147,8 @@ struct llama_client_slot
     std::vector<slot_image> images;
 
     // stats
-    size_t sent_count = 0;
-    size_t sent_token_probs_index = 0;
+    size_t n_sent_text = 0; // number of sent text character
+    size_t n_sent_token_probs = 0;
 
     int64_t t_start_process_prompt;
     int64_t t_start_genereration;
@@ -214,7 +160,7 @@ struct llama_client_slot
     int multitask_id = -1;
 
     void reset() {
-        num_prompt_tokens      = 0;
+        n_prompt_tokens        = 0;
         generated_text         = "";
         truncated              = false;
         stopped_eos            = false;
@@ -222,16 +168,15 @@ struct llama_client_slot
         stopped_limit          = false;
         stopping_word          = "";
         n_past                 = 0;
-        sent_count             = 0;
-        sent_token_probs_index = 0;
+        n_sent_text            = 0;
+        n_sent_token_probs     = 0;
         infill                 = false;
         ga_i                   = 0;
         n_past_se              = 0;
 
         generated_token_probs.clear();
 
-        for (slot_image & img : images)
-        {
+        for (slot_image & img : images) {
             free(img.image_embedding);
             if (img.img_data) {
                 clip_image_u8_free(img.img_data);
@@ -243,19 +188,15 @@ struct llama_client_slot
     }
 
     bool has_budget(gpt_params &global_params) {
-        if (params.n_predict == -1 && global_params.n_predict == -1)
-        {
+        if (params.n_predict == -1 && global_params.n_predict == -1) {
             return true; // limitless
         }
 
         n_remaining = -1;
 
-        if (params.n_predict != -1)
-        {
+        if (params.n_predict != -1) {
             n_remaining = params.n_predict - n_decoded;
-        }
-        else if (global_params.n_predict != -1)
-        {
+        } else if (global_params.n_predict != -1) {
             n_remaining = global_params.n_predict - n_decoded;
         }
 
@@ -271,8 +212,7 @@ struct llama_client_slot
     }
 
     void add_token_string(const completion_token_output &token) {
-        if (command == RELEASE)
-        {
+        if (command == RELEASE) {
             return;
         }
         cache_tokens.push_back(token.tok);
@@ -290,10 +230,10 @@ struct llama_client_slot
     json get_formated_timings() {
         return json
         {
-            {"prompt_n",               num_prompt_tokens_processed},
+            {"prompt_n",               n_prompt_tokens_processed},
             {"prompt_ms",              t_prompt_processing},
-            {"prompt_per_token_ms",    t_prompt_processing / num_prompt_tokens_processed},
-            {"prompt_per_second",      1e3 / t_prompt_processing * num_prompt_tokens_processed},
+            {"prompt_per_token_ms",    t_prompt_processing / n_prompt_tokens_processed},
+            {"prompt_per_second",      1e3 / t_prompt_processing * n_prompt_tokens_processed},
 
             {"predicted_n",            n_decoded},
             {"predicted_ms",           t_token_generation},
@@ -304,16 +244,16 @@ struct llama_client_slot
 
     void print_timings() const {
        char buffer[512];
-        double t_token = t_prompt_processing / num_prompt_tokens_processed;
-        double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed;
+        double t_token = t_prompt_processing / n_prompt_tokens_processed;
+        double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
         sprintf(buffer, "prompt eval time     = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
-                t_prompt_processing, num_prompt_tokens_processed,
+                t_prompt_processing, n_prompt_tokens_processed,
                 t_token, n_tokens_second);
         LOG_INFO(buffer, {
             {"slot_id",                   id},
             {"task_id",                   task_id},
             {"t_prompt_processing",       t_prompt_processing},
-            {"num_prompt_tokens_processed", num_prompt_tokens_processed},
+            {"n_prompt_tokens_processed", n_prompt_tokens_processed},
             {"t_token",                   t_token},
             {"n_tokens_second",           n_tokens_second},
         });
@@ -343,7 +283,7 @@ struct llama_client_slot
     }
 };
 
-struct llama_metrics {
+struct server_metrics {
     uint64_t n_prompt_tokens_processed_total = 0;
     uint64_t n_tokens_predicted_total        = 0;
 
@@ -354,16 +294,14 @@ struct llama_metrics {
     uint64_t t_tokens_generation      = 0;
 
-
-    void on_prompt_eval(const llama_client_slot &slot) {
-        n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed;
-
-        n_prompt_tokens_processed += slot.num_prompt_tokens_processed;
+    void on_prompt_eval(const server_slot &slot) {
+        n_prompt_tokens_processed_total += slot.n_prompt_tokens_processed;
+        n_prompt_tokens_processed       += slot.n_prompt_tokens_processed;
         t_prompt_processing             += slot.t_prompt_processing;
     }
 
-    void on_prediction(const llama_client_slot &slot) {
+    void on_prediction(const server_slot &slot) {
         n_tokens_predicted_total += slot.n_decoded;
 
         n_tokens_predicted       += slot.n_decoded;
         t_tokens_generation      += slot.t_token_generation;
     }
@@ -404,13 +342,13 @@ struct llama_server_context
     std::string name_assistant;
 
     // slots / clients
-    std::vector<llama_client_slot> slots;
+    std::vector<server_slot> slots;
     json default_generation_settings_for_props;
 
     llama_server_queue    queue_tasks;
     llama_server_response queue_results;
 
-    llama_metrics metrics;
+    server_metrics metrics;
 
     ~llama_server_context()
     {
@@ -487,7 +425,7 @@ struct llama_server_context
         LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
         for (int i = 0; i < params.n_parallel; i++)
         {
-            llama_client_slot slot;
+            server_slot slot;
 
             slot.id = i;
             slot.n_ctx = n_ctx_slot;
@@ -579,11 +517,11 @@ struct llama_server_context
         return prompt_tokens;
     }
 
-    llama_client_slot* get_slot(int id) {
+    server_slot* get_slot(int id) {
         int64_t t_last = ggml_time_us();
-        llama_client_slot *last_used = nullptr;
+        server_slot *last_used = nullptr;
 
-        for (llama_client_slot & slot : slots)
+        for (server_slot & slot : slots)
         {
             if (slot.id == id && slot.available())
             {
@@ -600,7 +538,7 @@ struct llama_server_context
         return last_used;
     }
 
-    bool launch_slot_with_data(llama_client_slot* &slot, json data) {
+    bool launch_slot_with_data(server_slot* &slot, json data) {
         slot_params default_params;
         llama_sampling_params default_sparams;
 
@@ -888,7 +826,7 @@ struct llama_server_context
         clean_kv_cache = false;
     }
 
-    void update_system_prompt() {
+    void system_prompt_update() {
         kv_cache_clear();
         system_tokens.clear();
 
@@ -933,9 +871,9 @@ struct llama_server_context
         system_need_update = false;
     }
 
-    void notify_system_prompt_changed() {
+    void system_prompt_notify() {
         // release all slots
-        for (llama_client_slot &slot : slots)
+        for (server_slot &slot : slots)
         {
             slot.release();
         }
@@ -943,17 +881,17 @@ struct llama_server_context
         system_need_update = true;
     }
 
-    void process_system_prompt_data(const json &sys_props) {
+    void system_prompt_process(const json &sys_props) {
         system_prompt  = sys_props.value("prompt", "");
         name_user      = sys_props.value("anti_prompt", "");
         name_assistant = sys_props.value("assistant_name", "");
 
 
-        notify_system_prompt_changed();
+        system_prompt_notify();
     }
 
     static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
-                                        const stop_type type, llama_client_slot &slot)
+                                        const stop_type type, server_slot &slot)
     {
         size_t stop_pos = std::string::npos;
 
@@ -986,7 +924,7 @@ struct llama_server_context
         return stop_pos;
     }
 
-    bool process_token(completion_token_output &result, llama_client_slot &slot) {
+    bool process_token(completion_token_output &result, server_slot &slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
         const std::string token_str = llama_token_to_piece(ctx, result.tok);
         slot.sampled = result.tok;
@@ -1032,7 +970,7 @@ struct llama_server_context
 
         if (!incomplete)
         {
-            size_t pos = std::min(slot.sent_count, slot.generated_text.size());
+            size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
             const std::string str_test = slot.generated_text.substr(pos);
             bool is_stop_full = false;
             size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
@@ -1042,7 +980,7 @@ struct llama_server_context
                 slot.generated_text.erase(
                     slot.generated_text.begin() + pos + stop_pos,
                     slot.generated_text.end());
-                pos = std::min(slot.sent_count, slot.generated_text.size());
+                pos = std::min(slot.n_sent_text, slot.generated_text.size());
             }
             else
             {
@@ -1055,7 +993,7 @@ struct llama_server_context
             {
                 // no send the stop word in the response
                 result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
-                slot.sent_count += result.text_to_send.size();
+                slot.n_sent_text += result.text_to_send.size();
                 // add the token to slot queue and cache
             }
             slot.add_token_string(result);
@@ -1099,7 +1037,7 @@ struct llama_server_context
         return slot.has_next_token; // continue
     }
 
-    bool process_images(llama_client_slot &slot) const
+    bool process_images(server_slot &slot) const
     {
         for (slot_image &img : slot.images)
         {
@@ -1132,7 +1070,7 @@ struct llama_server_context
         queue_results.send(res);
     }
 
-    json get_formated_generation(llama_client_slot &slot)
+    json get_formated_generation(server_slot &slot)
     {
         const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
         const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
@@ -1179,7 +1117,7 @@ struct llama_server_context
         };
     }
 
-    void send_partial_response(llama_client_slot &slot, completion_token_output tkn)
+    void send_partial_response(server_slot &slot, completion_token_output tkn)
     {
         task_result res;
         res.id = slot.task_id;
@@ -1199,13 +1137,13 @@ struct llama_server_context
         {
             std::vector<completion_token_output> probs_output = {};
             const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
-            size_t probs_pos      = std::min(slot.sent_token_probs_index,                       slot.generated_token_probs.size());
-            size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size());
+            size_t probs_pos      = std::min(slot.n_sent_token_probs,                       slot.generated_token_probs.size());
+            size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
             if (probs_pos < probs_stop_pos)
             {
                 probs_output = std::vector<completion_token_output>(slot.generated_token_probs.begin() + probs_pos, slot.generated_token_probs.begin() + probs_stop_pos);
             }
-            slot.sent_token_probs_index = probs_stop_pos;
+            slot.n_sent_token_probs = probs_stop_pos;
             res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
         }
 
@@ -1218,7 +1156,7 @@ struct llama_server_context
         queue_results.send(res);
     }
 
-    void send_final_response(llama_client_slot &slot)
+    void send_final_response(server_slot &slot)
     {
         task_result res;
         res.id = slot.task_id;
@@ -1233,7 +1171,7 @@ struct llama_server_context
             {"stop",                true},
             {"model",               params.model_alias},
             {"tokens_predicted",    slot.n_decoded},
-            {"tokens_evaluated",    slot.num_prompt_tokens},
+            {"tokens_evaluated",    slot.n_prompt_tokens},
             {"generation_settings", get_formated_generation(slot)},
             {"prompt",              slot.prompt},
             {"truncated",           slot.truncated},
@@ -1271,7 +1209,7 @@ struct llama_server_context
         queue_results.send(res);
     }
 
-    void send_embedding(llama_client_slot &slot)
+    void send_embedding(server_slot &slot)
     {
         task_result res;
         res.id = slot.task_id;
@@ -1282,9 +1220,7 @@ struct llama_server_context
         const int n_embd = llama_n_embd(model);
         if (!params.embedding)
         {
-            LOG_WARNING("embedding disabled", {
-                                                  {"params.embedding", params.embedding},
-                                              });
+            LOG_WARNING("embedding disabled", {{"params.embedding", params.embedding}});
             res.result_json = json
             {
                 {"embedding", std::vector<float>(n_embd, 0.0f)},
@@ -1296,7 +1232,7 @@ struct llama_server_context
             std::vector<float> embedding(data, data + n_embd);
             res.result_json = json
             {
-                {"embedding", embedding },
+                {"embedding", embedding},
             };
         }
         queue_results.send(res);
@@ -1345,7 +1281,7 @@ struct llama_server_context
     }
 
     // for multiple images processing
-    bool ingest_images(llama_client_slot &slot, int n_batch)
+    bool ingest_images(server_slot &slot, int n_batch)
     {
         int image_idx = 0;
 
@@ -1384,7 +1320,17 @@ struct llama_server_context
                 }
 
                 const int n_embd = llama_n_embd(model);
-                llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
+                llama_batch batch_img = {
+                    n_eval,
+                    nullptr,
+                    (img.image_embedding + i * n_embd),
+                    nullptr,
+                    nullptr,
+                    nullptr,
+                    nullptr,
+                    slot.n_past,
+                    1, 0
+                };
                 if (llama_decode(ctx, batch_img))
                 {
                     LOG_TEE("%s : failed to eval image\n", __func__);
@@ -1454,7 +1400,7 @@ struct llama_server_context
         switch (task.type)
         {
             case TASK_TYPE_COMPLETION: {
-                llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
+                server_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
                 if (slot == nullptr)
                 {
                     // if no slot is available, we defer this task for processing later
@@ -1469,10 +1415,10 @@ struct llama_server_context
                         send_error(task, "system prompt can only be updated when all slots are idle");
                         break;
                     }
-                    process_system_prompt_data(task.data["system_prompt"]);
+                    system_prompt_process(task.data["system_prompt"]);
 
                     // reset cache_tokens for all slots
-                    for (llama_client_slot &slot : slots)
+                    for (server_slot &slot : slots)
                     {
                         slot.cache_tokens.clear();
                         slot.n_past    = 0;
@@ -1512,7 +1458,7 @@ struct llama_server_context
                 int n_idle_slots       = 0;
                 int n_processing_slots = 0;
 
-                for (llama_client_slot &slot: slots) {
+                for (server_slot &slot: slots) {
                     json slot_data = get_formated_generation(slot);
                     slot_data["id"] = slot.id;
                     slot_data["task_id"] = slot.task_id;
@@ -1597,7 +1543,7 @@ struct llama_server_context
         if (system_need_update)
         {
             LOG_INFO("updating system prompt", {});
-            update_system_prompt();
+            system_prompt_update();
         }
 
         llama_batch_clear(batch);
@@ -1618,7 +1564,7 @@ struct llama_server_context
         task.target_id = -1;
         queue_tasks.post(task);
 
-        for (llama_client_slot &slot : slots)
+        for (server_slot &slot : slots)
         {
             if (slot.ga_n == 1)
             {
@@ -1754,23 +1700,28 @@ struct llama_server_context
                         prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token);  // add BOS if there isn't system prompt
                     }
 
-                    slot.num_prompt_tokens = prompt_tokens.size();
+                    slot.n_prompt_tokens = prompt_tokens.size();
 
                     if (slot.params.n_keep < 0)
                     {
-                        slot.params.n_keep = slot.num_prompt_tokens;
+                        slot.params.n_keep = slot.n_prompt_tokens;
                     }
                     slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
 
                     // if input prompt is too big, truncate it
-                    if (slot.num_prompt_tokens >= slot.n_ctx)
+                    if (slot.n_prompt_tokens >= slot.n_ctx)
                     {
                         const int n_left = slot.n_ctx - slot.params.n_keep;
                         const int n_block_size = n_left / 2;
-                        const int erased_blocks = (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
+                        const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
 
-                        std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep);
-                        new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
+                        std::vector<llama_token> new_tokens(
+                            prompt_tokens.begin(),
+                            prompt_tokens.begin() + slot.params.n_keep);
+                        new_tokens.insert(
+                            new_tokens.end(),
+                            prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
+                            prompt_tokens.end());
 
                         LOG_VERBOSE("input truncated", {
                             {"n_ctx",      slot.n_ctx},
@@ -1781,8 +1732,8 @@ struct llama_server_context
                         slot.truncated = true;
                         prompt_tokens = new_tokens;
 
-                        slot.num_prompt_tokens = prompt_tokens.size();
-                        GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
+                        slot.n_prompt_tokens = prompt_tokens.size();
+                        GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
                     }
 
                     if (!slot.params.cache_prompt)
@@ -1792,7 +1743,7 @@ struct llama_server_context
                         slot.n_past    = 0;
                         slot.n_past_se = 0;
                         slot.ga_i      = 0;
-                        slot.num_prompt_tokens_processed = slot.num_prompt_tokens;
+                        slot.n_prompt_tokens_processed = slot.n_prompt_tokens;
                     }
                     else
                     {
@@ -1811,7 +1762,7 @@ struct llama_server_context
                             slot.n_past -= 1;
                         }
 
-                        slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;
+                        slot.n_prompt_tokens_processed = slot.n_prompt_tokens - slot.n_past;
 
                         if (slot.ga_n != 1)
                         {
@@ -1836,13 +1787,13 @@ struct llama_server_context
                             { "slot_id", slot.id },
                            { "task_id", slot.task_id },
                             { "n_past",  slot.n_past },
-                            { "num_prompt_tokens_processed", slot.num_prompt_tokens_processed }
+                            { "n_prompt_tokens_processed", slot.n_prompt_tokens_processed }
                         });
                     }
 
                     slot.cache_tokens = prompt_tokens;
 
-                    if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
+                    if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0)
                     {
                         // we have to evaluate at least 1 token to generate logits.
                         LOG_INFO("we have to evaluate at least 1 token to generate logits", {
@@ -1898,8 +1849,8 @@ struct llama_server_context
                     if (has_images && !ingest_images(slot, n_batch))
                     {
                         LOG_ERROR("failed processing images", {
-                            "slot_id", slot.id,
-                            "task_id", slot.task_id,
+                            {"slot_id", slot.id},
+                            {"task_id", slot.task_id},
                         });
                         // FIXME @phymbert: to be properly tested
                         //  early returning without changing the slot state will block the slot for ever
@@ -2049,10 +2000,6 @@ struct llama_server_context
         LOG_VERBOSE("slots updated", {});
         return true;
     }
-
-    void run_on_all_tasks_finished() {
-        update_slots();
-    }
 };
 
 static void server_print_usage(const char *argv0, const gpt_params &params,
@@ -2561,7 +2508,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                 std::istreambuf_iterator<char>(),
                 std::back_inserter(systm_content)
             );
-            llama.process_system_prompt_data(json::parse(systm_content));
+            llama.system_prompt_process(json::parse(systm_content));
         }
         else if (arg == "-ctk" || arg == "--cache-type-k") {
             params.cache_type_k = argv[++i];
@@ -2692,7 +2639,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 
 /* llama.cpp completion api semantics */
 static json format_partial_response(
-    llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs
+    llama_server_context &llama, server_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs
 ) {
     json res = json
     {
@@ -2748,14 +2695,7 @@ static void log_server_request(const httplib::Request &req, const httplib::Respo
     });
 }
 
-struct token_translator
-{
-    llama_context * ctx;
-    std::string operator()(llama_token tok)                    const { return llama_token_to_piece(ctx, tok); }
-    std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
-};
-
-static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, llama_client_slot *slot)
+static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, server_slot *slot)
 {
     auto & gtps = slot->generated_token_probs;
     auto translator = token_translator{llama.ctx};
@@ -3526,8 +3466,8 @@ int main(int argc, char **argv)
         &llama_server_context::process_single_task, &llama, std::placeholders::_1));
     llama.queue_tasks.on_finish_multitask(std::bind(
         &llama_server_context::on_finish_multitask, &llama, std::placeholders::_1));
-    llama.queue_tasks.on_all_tasks_finished(std::bind(
-        &llama_server_context::run_on_all_tasks_finished, &llama));
+    llama.queue_tasks.on_run_slots(std::bind(
+        &llama_server_context::update_slots, &llama));
     llama.queue_results.on_multitask_update(std::bind(
         &llama_server_queue::update_multitask,
         &llama.queue_tasks,
|   | |||||||
@@ -37,10 +37,6 @@ extern bool server_log_json;
 #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO(   MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
 
-//
-// parallel
-//
-
 enum server_state {
     SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
     SERVER_STATE_READY,          // Server is ready and model is loaded
@@ -78,51 +74,8 @@ struct task_multi {
     std::vector<task_result> results{};
 };
 
-// TODO: can become bool if we can't find use of more states
-enum slot_state
-{
-    IDLE,
-    PROCESSING,
-};
-
-enum slot_command
-{
-    NONE,
-    LOAD_PROMPT,
-    RELEASE,
-};
-
-struct slot_params
-{
-    bool stream       = true;
-    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
-
-    uint32_t seed      = -1; // RNG seed
-    int32_t  n_keep    =  0; // number of tokens to keep from initial prompt
-    int32_t  n_predict = -1; // new tokens to predict
-
-    std::vector<std::string> antiprompt;
-
-    json input_prefix;
-    json input_suffix;
-};
-
-struct slot_image
-{
-    int32_t id;
-
-    bool request_encode_image = false;
-    float * image_embedding = nullptr;
-    int32_t image_tokens = 0;
-
-    clip_image_u8 * img_data;
-
-    std::string prefix_prompt; // before of this image
-};
-
 // completion token output with probabilities
-struct completion_token_output
-{
+struct completion_token_output {
     struct token_prob
     {
         llama_token tok;
@@ -134,8 +87,13 @@ struct completion_token_output
     std::string text_to_send;
 };
 
-static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra)
-{
+struct token_translator {
+    llama_context * ctx;
+    std::string operator()(llama_token tok)                    const { return llama_token_to_piece(ctx, tok); }
+    std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
+};
+
+static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
     std::stringstream ss_tid;
     ss_tid << std::this_thread::get_id();
     json log = nlohmann::ordered_json{
@@ -183,8 +141,7 @@ static inline void server_log(const char *level, const char *function, int line,
 //
 
 template <typename T>
-static T json_value(const json &body, const std::string &key, const T &default_value)
-{
+static T json_value(const json &body, const std::string &key, const T &default_value) {
     // Fallback null to default value
     return body.contains(key) && !body.at(key).is_null()
         ? body.value(key, default_value)
@@ -200,8 +157,7 @@ inline bool verify_custom_template(const std::string & tmpl) {
 }
 
 // Format given chat. If tmpl is empty, we take the template from model metadata
-inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages)
-{
+inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
     size_t alloc_size = 0;
     // vector holding all allocated string to be passed to llama_chat_apply_template
     std::vector<std::string> str(messages.size() * 2);
@@ -250,7 +206,7 @@ struct llama_server_queue {
     // callback functions
     std::function<void(task_server&)> callback_new_task;
     std::function<void(task_multi&)> callback_finish_multitask;
-    std::function<void(void)> callback_all_task_finished;
+    std::function<void(void)> callback_run_slots;
 
     // Add a new task to the end of the queue
     int post(task_server task) {
@@ -283,14 +239,14 @@ struct llama_server_queue {
         callback_new_task = callback;
     }
 
-    // Register function to process a multitask
+    // Register function to process a multitask when it is finished
     void on_finish_multitask(std::function<void(task_multi&)> callback) {
         callback_finish_multitask = callback;
     }
 
-    // Register the function to be called when the batch of tasks is finished
-    void on_all_tasks_finished(std::function<void(void)> callback) {
-        callback_all_task_finished = callback;
+    // Register the function to be called when all slots data is ready to be processed
+    void on_run_slots(std::function<void(void)> callback) {
+        callback_run_slots = callback;
     }
 
     // Call when the state of one slot is changed
@@ -312,7 +268,13 @@ struct llama_server_queue {
         condition_tasks.notify_all();
     }
 
-    // Start the main loop.
+    /**
+     * Main loop consists of these steps:
+     * - Wait until a new task arrives
+     * - Process the task (i.e. maybe copy data into slot)
+     * - Check if multitask is finished
+     * - Run all slots
+     */
     void start_loop() {
         running = true;
         while (true) {
| @@ -331,8 +293,8 @@ struct llama_server_queue { | |||||||
|                     LOG_VERBOSE("callback_new_task", {{"task_id", task.id}}); |                     LOG_VERBOSE("callback_new_task", {{"task_id", task.id}}); | ||||||
|                     callback_new_task(task); |                     callback_new_task(task); | ||||||
|                 } |                 } | ||||||
|                 LOG_VERBOSE("callback_all_task_finished", {}); |                 LOG_VERBOSE("update_multitasks", {}); | ||||||
|                 // process and update all the multitasks |                 // check if we have any finished multitasks | ||||||
|                 auto queue_iterator = queue_multitasks.begin(); |                 auto queue_iterator = queue_multitasks.begin(); | ||||||
|                 while (queue_iterator != queue_multitasks.end()) |                 while (queue_iterator != queue_multitasks.end()) | ||||||
|                 { |                 { | ||||||
| @@ -349,8 +311,9 @@ struct llama_server_queue { | |||||||
|                         ++queue_iterator; |                         ++queue_iterator; | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|                 // all tasks in the current loop is finished |                 // all tasks in the current loop are processed; slot data is now ready | ||||||
|                 callback_all_task_finished(); |                 LOG_VERBOSE("callback_run_slots", {}); | ||||||
|  |                 callback_run_slots(); | ||||||
|             } |             } | ||||||
|             LOG_VERBOSE("wait for new task", {}); |             LOG_VERBOSE("wait for new task", {}); | ||||||
|             // wait for new task |             // wait for new task | ||||||
| @@ -408,12 +371,14 @@ struct llama_server_response { | |||||||
|     std::mutex mutex_results; |     std::mutex mutex_results; | ||||||
|     std::condition_variable condition_results; |     std::condition_variable condition_results; | ||||||
|  |  | ||||||
|  |     // add the task_id to the list of tasks waiting for response | ||||||
|     void add_waiting_task_id(int task_id) { |     void add_waiting_task_id(int task_id) { | ||||||
|         LOG_VERBOSE("waiting for task id", {{"task_id", task_id}}); |         LOG_VERBOSE("waiting for task id", {{"task_id", task_id}}); | ||||||
|         std::unique_lock<std::mutex> lock(mutex_results); |         std::unique_lock<std::mutex> lock(mutex_results); | ||||||
|         waiting_task_ids.insert(task_id); |         waiting_task_ids.insert(task_id); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     // when the request is finished, we can remove the task associated with it | ||||||
|     void remove_waiting_task_id(int task_id) { |     void remove_waiting_task_id(int task_id) { | ||||||
|         LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}}); |         LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}}); | ||||||
|         std::unique_lock<std::mutex> lock(mutex_results); |         std::unique_lock<std::mutex> lock(mutex_results); | ||||||
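
Together with post() on the queue, the two helpers above bracket the lifetime of a request. A hedged sketch of a blocking request path; get_new_id(), recv() and task_result do not appear in this hunk and are assumed for illustration:

    const int task_id = queue_tasks.get_new_id();       // assumed helper for a fresh id
    queue_results.add_waiting_task_id(task_id);         // register interest before posting
    task.id = task_id;
    queue_tasks.post(task);                             // enqueue the work for the queue loop
    task_result result = queue_results.recv(task_id);   // assumed blocking receive
    queue_results.remove_waiting_task_id(task_id);      // always clean up, also on error paths
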
| @@ -574,3 +539,96 @@ static std::string gen_chatcmplid() | |||||||
|     chatcmplid << "chatcmpl-" << random_string(); |     chatcmplid << "chatcmpl-" << random_string(); | ||||||
|     return chatcmplid.str(); |     return chatcmplid.str(); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | // | ||||||
|  | // other common utils | ||||||
|  | // | ||||||
|  |  | ||||||
|  | static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b) | ||||||
|  | { | ||||||
|  |     size_t i; | ||||||
|  |     for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) | ||||||
|  |     { | ||||||
|  |     } | ||||||
|  |     return i; | ||||||
|  | } | ||||||
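
common_part() returns the length of the shared token prefix of two sequences; a typical use is prompt caching, where only the suffix after the shared prefix needs to be re-decoded. The token values below are made up for illustration:

    std::vector<llama_token> cached = { 10, 20, 30, 40 };   // tokens already in the KV cache
    std::vector<llama_token> prompt = { 10, 20, 30, 99 };   // tokens of the incoming request
    const size_t n_common = common_part(cached, prompt);    // == 3
    // positions [0, n_common) of the KV cache can be kept; decoding restarts at n_common
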
|  |  | ||||||
|  | static bool ends_with(const std::string &str, const std::string &suffix) | ||||||
|  | { | ||||||
|  |     return str.size() >= suffix.size() && | ||||||
|  |            0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | static size_t find_partial_stop_string(const std::string &stop, | ||||||
|  |                                        const std::string &text) | ||||||
|  | { | ||||||
|  |     if (!text.empty() && !stop.empty()) | ||||||
|  |     { | ||||||
|  |         const char text_last_char = text.back(); | ||||||
|  |         for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) | ||||||
|  |         { | ||||||
|  |             if (stop[char_index] == text_last_char) | ||||||
|  |             { | ||||||
|  |                 const std::string current_partial = stop.substr(0, char_index + 1); | ||||||
|  |                 if (ends_with(text, current_partial)) | ||||||
|  |                 { | ||||||
|  |                     return text.size() - char_index - 1; | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |     return std::string::npos; | ||||||
|  | } | ||||||
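
find_partial_stop_string() answers whether the generated text ends with a prefix of a stop word, which matters when streaming so that a stop word split across two chunks is not leaked to the client. It returns the index where that prefix starts, or std::string::npos if there is none; the values below are illustrative:

    const std::string stop = "</s>";
    const std::string text = "Hello world </";                 // ends with "</", a prefix of the stop word
    const size_t pos = find_partial_stop_string(stop, text);   // == 12, where "</" begins
    // the caller can hold back text.substr(pos) until more tokens arrive
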
|  |  | ||||||
|  | // TODO: reuse llama_detokenize | ||||||
|  | template <class Iter> | ||||||
|  | static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) | ||||||
|  | { | ||||||
|  |     std::string ret; | ||||||
|  |     for (; begin != end; ++begin) | ||||||
|  |     { | ||||||
|  |         ret += llama_token_to_piece(ctx, *begin); | ||||||
|  |     } | ||||||
|  |     return ret; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // format incomplete utf-8 multibyte character for output | ||||||
|  | static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) | ||||||
|  | { | ||||||
|  |     std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token); | ||||||
|  |     // if the size is 1 and the first bit is 1, it's a partial character | ||||||
|  |     //   (size > 1 means it's already a known token) | ||||||
|  |     if (out.size() == 1 && (out[0] & 0x80) == 0x80) | ||||||
|  |     { | ||||||
|  |         std::stringstream ss; | ||||||
|  |         ss << std::hex << (out[0] & 0xff); | ||||||
|  |         std::string res(ss.str()); | ||||||
|  |         out = "byte: \\x" + res; | ||||||
|  |     } | ||||||
|  |     return out; | ||||||
|  | } | ||||||
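
The placeholder above keeps streamed output valid when a token decodes to a lone UTF-8 fragment. A small sketch of the two outcomes; the token id is hypothetical:

    // a token that decodes to a complete piece is passed through unchanged, e.g. "Hello";
    // a token whose piece is a single byte with the high bit set comes back as
    // "byte: \xe4" (lowercase hex of that byte), and a token id of -1 yields an empty string
    const std::string piece = tokens_to_output_formatted_string(ctx, token_id);
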
|  |  | ||||||
|  | // convert a vector of completion_token_output to json | ||||||
|  | static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs) | ||||||
|  | { | ||||||
|  |     json out = json::array(); | ||||||
|  |     for (const auto &prob : probs) | ||||||
|  |     { | ||||||
|  |         json probs_for_token = json::array(); | ||||||
|  |         for (const auto &p : prob.probs) | ||||||
|  |         { | ||||||
|  |             std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); | ||||||
|  |             probs_for_token.push_back(json | ||||||
|  |             { | ||||||
|  |                 {"tok_str", tok_str}, | ||||||
|  |                 {"prob",    p.prob}, | ||||||
|  |             }); | ||||||
|  |         } | ||||||
|  |         std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); | ||||||
|  |         out.push_back(json{ | ||||||
|  |             {"content", tok_str}, | ||||||
|  |             {"probs",   probs_for_token}, | ||||||
|  |         }); | ||||||
|  |     } | ||||||
|  |     return out; | ||||||
|  | } | ||||||
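
For reference, the JSON produced for each generated token has this shape; the keys come from the code above, the values are invented for illustration:

    [
      {
        "content": "Hello",
        "probs": [
          { "tok_str": "Hello", "prob": 0.91 },
          { "tok_str": "Hi",    "prob": 0.04 }
        ]
      }
    ]
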
|   | |||||||