	server: logs - unified format and --log-format option (#5700)
* server: logs: always use the JSON logger, add thread_id to messages, log task_id and slot_id
* server: skip GH copilot requests from logging
* server: change the message format of server_log()
* server: no need to repeat the log in a comment
* server: log style consistency
* server: fix compile warning
* server: fix tests regex patterns on M2 Ultra
* server: logs: PR feedback on log level
* server: logs: allow choosing the log format, json or plain text
* server: tests: output server logs in text
* server: logs: switch init logs to the server log macros
* server: logs: ensure json values do not raise errors
* server: logs: reduce level VERBOSE to VERB (max 4 chars)
* server: logs: lower case, as in other log messages
* server: logs: avoid static in general
* server: logs: PR feedback: change the text log format to: LEVEL [function_name] message | additional=data

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
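For orientation, the sketch below is a hypothetical, self-contained `demo_log()` helper, not the server's code; it only assumes nlohmann/json is available, and the example function name, slot_id and task_id are made up. It reproduces the two output styles this change introduces: json mode emits one object per line carrying `tid`, `timestamp`, `level`, `function`, `line`, `msg` plus any extra fields, while text mode prints `LEVEL [function_name] message | key=value ...`. The real implementation is `server_log()` in the diff below and is selected at startup with `--log-format json|text`.

```cpp
// Minimal sketch of the two log styles added in this commit -- illustrative only,
// not the server's code. Assumes nlohmann/json is installed; the example values
// (function name, slot_id, task_id) are made up.
#include <cstdio>
#include <ctime>
#include <sstream>
#include <string>
#include <thread>
#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

static void demo_log(bool as_json, const char * level, const char * function, int line,
                     const char * message, const json & extra) {
    std::stringstream ss_tid;
    ss_tid << std::this_thread::get_id();

    if (as_json) {
        // json mode: one object per line with tid, timestamp, level, function, line, msg + extras
        json log = {
            {"tid",       ss_tid.str()},
            {"timestamp", time(nullptr)},
            {"level",     level},
            {"function",  function},
            {"line",      line},
            {"msg",       message},
        };
        if (!extra.empty()) {
            log.merge_patch(extra);
        }
        printf("%s\n", log.dump().c_str());
    } else {
        // text mode: LEVEL [function_name] message | key=value key=value ...
        char buf[1024];
        snprintf(buf, sizeof(buf), "%4s [%24s] %s |", level, function, message);
        std::stringstream ss;
        ss << buf;
        for (const auto & el : extra.items()) {
            ss << " " << el.key() << "=" << el.value().dump();
        }
        printf("%s\n", ss.str().c_str());
    }
    fflush(stdout);
}

int main() {
    const json extra = {{"slot_id", 0}, {"task_id", 7}};
    demo_log(true,  "INFO", "print_timings", 0, "prompt eval time", extra); // --log-format json (default)
    demo_log(false, "INFO", "print_timings", 0, "prompt eval time", extra); // --log-format text
    return 0;
}
```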
		| @@ -39,10 +39,12 @@ see https://github.com/ggerganov/llama.cpp/issues/1437 | ||||
| - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA. | ||||
| - `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w` | ||||
| - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n` | ||||
| - `-n, --n-predict`: Set the maximum tokens to predict (default: -1) | ||||
| - `-n N, --n-predict N`: Set the maximum tokens to predict (default: -1) | ||||
| - `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included. | ||||
| - `--metrics`: enable prometheus `/metrics` compatible endpoint (default: disabled) | ||||
| - `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) | ||||
| - `--log-disable`: Output logs to stdout only, default: enabled. | ||||
| - `--log-format FORMAT`: Define the log output to FORMAT: json or text (default: json) | ||||
|  | ||||
| ## Build | ||||
|  | ||||
|   | ||||
| @@ -47,6 +47,7 @@ struct server_params | ||||
| }; | ||||
|  | ||||
| bool server_verbose = false; | ||||
| bool server_log_json = true; | ||||
|  | ||||
| static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b) | ||||
| { | ||||
| @@ -302,12 +303,43 @@ struct llama_client_slot | ||||
|     } | ||||
|  | ||||
|     void print_timings() const { | ||||
|         LOG_TEE("\n"); | ||||
|         LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", | ||||
|             __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed); | ||||
|         LOG_TEE("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n", | ||||
|             __func__, t_token_generation, n_decoded,t_token_generation / n_decoded, 1e3 / t_token_generation * n_decoded); | ||||
|         LOG_TEE("%s:       total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation); | ||||
|        char buffer[512]; | ||||
|         double t_token = t_prompt_processing / num_prompt_tokens_processed; | ||||
|         double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed; | ||||
|         sprintf(buffer, "prompt eval time     = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)", | ||||
|                 t_prompt_processing, num_prompt_tokens_processed, | ||||
|                 t_token, n_tokens_second); | ||||
|         LOG_INFO(buffer, { | ||||
|             {"slot_id",                     id}, | ||||
|             {"task_id",                     task_id}, | ||||
|             {"t_prompt_processing",         t_prompt_processing}, | ||||
|             {"num_prompt_tokens_processed", num_prompt_tokens_processed}, | ||||
|             {"t_token",                     t_token}, | ||||
|             {"n_tokens_second",             n_tokens_second}, | ||||
|         }); | ||||
|  | ||||
|         t_token = t_token_generation / n_decoded; | ||||
|         n_tokens_second = 1e3 / t_token_generation * n_decoded; | ||||
|         sprintf(buffer, "generation eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)", | ||||
|                 t_token_generation, n_decoded, | ||||
|                 t_token, n_tokens_second); | ||||
|         LOG_INFO(buffer, { | ||||
|             {"slot_id",            id}, | ||||
|             {"task_id",            task_id}, | ||||
|             {"t_token_generation", t_token_generation}, | ||||
|             {"n_decoded",          n_decoded}, | ||||
|             {"t_token",            t_token}, | ||||
|             {"n_tokens_second",    n_tokens_second}, | ||||
|         }); | ||||
|  | ||||
|         sprintf(buffer, "          total time = %10.2f ms", t_prompt_processing + t_token_generation); | ||||
|         LOG_INFO(buffer, { | ||||
|             {"slot_id",             id}, | ||||
|             {"task_id",             task_id}, | ||||
|             {"t_prompt_processing", t_prompt_processing}, | ||||
|             {"t_token_generation",  t_token_generation}, | ||||
|             {"t_total",             t_prompt_processing + t_token_generation}, | ||||
|         }); | ||||
|     } | ||||
| }; | ||||
|  | ||||
| @@ -399,7 +431,7 @@ struct llama_server_context | ||||
|         params = params_; | ||||
|         if (!params.mmproj.empty()) { | ||||
|             multimodal = true; | ||||
|             LOG_TEE("Multi Modal Mode Enabled"); | ||||
|             LOG_INFO("Multi Modal Mode Enabled", {}); | ||||
|             clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1); | ||||
|             if(clp_ctx == nullptr) { | ||||
|                 LOG_ERROR("unable to load clip model", {{"model", params.mmproj}}); | ||||
| @@ -452,7 +484,7 @@ struct llama_server_context | ||||
|  | ||||
|         const int32_t n_ctx_slot = n_ctx / params.n_parallel; | ||||
|  | ||||
|         LOG_TEE("Available slots:\n"); | ||||
|         LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}}); | ||||
|         for (int i = 0; i < params.n_parallel; i++) | ||||
|         { | ||||
|             llama_client_slot slot; | ||||
| @@ -461,7 +493,10 @@ struct llama_server_context | ||||
|             slot.n_ctx = n_ctx_slot; | ||||
|             slot.n_predict = params.n_predict; | ||||
|  | ||||
|             LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot); | ||||
|             LOG_INFO("new slot", { | ||||
|                 {"slot_id",    slot.id}, | ||||
|                 {"n_ctx_slot", slot.n_ctx} | ||||
|             }); | ||||
|  | ||||
|             const int ga_n = params.grp_attn_n; | ||||
|             const int ga_w = params.grp_attn_w; | ||||
| @@ -471,7 +506,12 @@ struct llama_server_context | ||||
|                 GGML_ASSERT(ga_w % ga_n == 0            && "ga_w must be a multiple of ga_n");     // NOLINT | ||||
|                 //GGML_ASSERT(n_ctx_train % ga_w == 0     && "n_ctx_train must be a multiple of ga_w");    // NOLINT | ||||
|                 //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT | ||||
|                 LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w); | ||||
|  | ||||
|                 LOG_INFO("slot self-extend", { | ||||
|                     {"slot_id",   slot.id}, | ||||
|                     {"ga_n",      ga_n}, | ||||
|                     {"ga_w",      ga_w} | ||||
|                 }); | ||||
|             } | ||||
|  | ||||
|             slot.ga_i = 0; | ||||
| @@ -765,10 +805,16 @@ struct llama_server_context | ||||
|                     img_sl.img_data = clip_image_u8_init(); | ||||
|                     if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data)) | ||||
|                     { | ||||
|                         LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id); | ||||
|                         LOG_ERROR("failed to load image", { | ||||
|                             {"slot_id",   slot->id}, | ||||
|                             {"img_sl_id", img_sl.id} | ||||
|                         }); | ||||
|                         return false; | ||||
|                     } | ||||
|                     LOG_TEE("slot %i - loaded image\n", slot->id); | ||||
|                     LOG_VERBOSE("image loaded", { | ||||
|                         {"slot_id",   slot->id}, | ||||
|                         {"img_sl_id", img_sl.id} | ||||
|                     }); | ||||
|                     img_sl.request_encode_image = true; | ||||
|                     slot->images.push_back(img_sl); | ||||
|                 } | ||||
| @@ -828,7 +874,10 @@ struct llama_server_context | ||||
|  | ||||
|         all_slots_are_idle = false; | ||||
|  | ||||
|         LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id); | ||||
|         LOG_INFO("slot is processing task", { | ||||
|             {"slot_id", slot->id}, | ||||
|             {"task_id", slot->task_id}, | ||||
|         }); | ||||
|  | ||||
|         return true; | ||||
|     } | ||||
| @@ -1391,7 +1440,7 @@ struct llama_server_context | ||||
|                 if (slot == nullptr) | ||||
|                 { | ||||
|                     // if no slot is available, we defer this task for processing later | ||||
|                     LOG_VERBOSE("no slot is available", {}); | ||||
|                     LOG_VERBOSE("no slot is available", {{"task_id", task.id}}); | ||||
|                     queue_tasks.defer(task); | ||||
|                     break; | ||||
|                 } | ||||
| @@ -1467,7 +1516,17 @@ struct llama_server_context | ||||
|                     } | ||||
|                     slots_data.push_back(slot_data); | ||||
|                 } | ||||
|                 LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots); | ||||
|                 LOG_INFO("slot data", { | ||||
|                     {"task_id",            task.id}, | ||||
|                     {"n_idle_slots",       n_idle_slots}, | ||||
|                     {"n_processing_slots", n_processing_slots} | ||||
|                 }); | ||||
|                 LOG_VERBOSE("slot data", { | ||||
|                     {"task_id",            task.id}, | ||||
|                     {"n_idle_slots",       n_idle_slots}, | ||||
|                     {"n_processing_slots", n_processing_slots}, | ||||
|                     {"slots",              slots_data} | ||||
|                 }); | ||||
|                 task_result res; | ||||
|                 res.id = task.id; | ||||
|                 res.multitask_id = task.multitask_id; | ||||
| @@ -1519,7 +1578,7 @@ struct llama_server_context | ||||
|     bool update_slots() { | ||||
|         if (system_need_update) | ||||
|         { | ||||
|             LOG_TEE("updating system prompt\n"); | ||||
|             LOG_INFO("updating system prompt", {}); | ||||
|             update_system_prompt(); | ||||
|         } | ||||
|  | ||||
| @@ -1529,12 +1588,13 @@ struct llama_server_context | ||||
|         { | ||||
|             if (system_prompt.empty() && clean_kv_cache) | ||||
|             { | ||||
|                 LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n"); | ||||
|                 LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {}); | ||||
|                 kv_cache_clear(); | ||||
|             } | ||||
|             return true; | ||||
|         } | ||||
|  | ||||
|         LOG_VERBOSE("posting NEXT_RESPONSE", {}); | ||||
|         task_server task; | ||||
|         task.type = TASK_TYPE_NEXT_RESPONSE; | ||||
|         task.target_id = -1; | ||||
| @@ -1548,10 +1608,20 @@ struct llama_server_context | ||||
|                 { | ||||
|                     // Shift context | ||||
|                     const int n_keep    = slot.params.n_keep + add_bos_token; | ||||
|                     const int n_left    = system_tokens.size() + slot.n_past - n_keep; | ||||
|                     const int n_left    = (int) system_tokens.size() + slot.n_past - n_keep; | ||||
|                     const int n_discard = n_left / 2; | ||||
|  | ||||
|                     LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, n_keep, n_left, n_discard); | ||||
|                     LOG_INFO("slot context shift", { | ||||
|                         {"slot_id",         slot.id}, | ||||
|                         {"task_id",         slot.task_id}, | ||||
|                         {"n_keep",          n_keep}, | ||||
|                         {"n_left",          n_left}, | ||||
|                         {"n_discard",       n_discard}, | ||||
|                         {"n_ctx",           n_ctx}, | ||||
|                         {"n_past",          slot.n_past}, | ||||
|                         {"n_system_tokens", system_tokens.size()}, | ||||
|                         {"n_cache_tokens",  slot.cache_tokens.size()} | ||||
|                     }); | ||||
|                     llama_kv_cache_seq_rm   (ctx, slot.id, n_keep            , n_keep + n_discard); | ||||
|                     llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); | ||||
|  | ||||
| @@ -1565,17 +1635,12 @@ struct llama_server_context | ||||
|                     slot.n_past -= n_discard; | ||||
|  | ||||
|                     slot.truncated = true; | ||||
|  | ||||
|                     LOG_VERBOSE("context shift", { | ||||
|                         { "n_ctx", n_ctx }, | ||||
|                         { "n_keep", n_keep }, | ||||
|                         { "n_left", n_left }, | ||||
|                     }); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // decode any currently ongoing sequences | ||||
|         LOG_VERBOSE("decoding ongoing sequences", {}); | ||||
|         for (auto & slot : slots) | ||||
|         { | ||||
|             // release the slot | ||||
| @@ -1585,7 +1650,15 @@ struct llama_server_context | ||||
|                 slot.command = NONE; | ||||
|                 slot.t_last_used = ggml_time_us(); | ||||
|  | ||||
|                 LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size()); | ||||
|                 LOG_INFO("slot released", { | ||||
|                     {"slot_id",         slot.id}, | ||||
|                     {"task_id",         slot.task_id}, | ||||
|                     {"n_ctx",           n_ctx}, | ||||
|                     {"n_past",          slot.n_past}, | ||||
|                     {"n_system_tokens", system_tokens.size()}, | ||||
|                     {"n_cache_tokens",  slot.cache_tokens.size()}, | ||||
|                     {"truncated",       slot.truncated} | ||||
|                 }); | ||||
|                 queue_tasks.notify_slot_changed(); | ||||
|  | ||||
|                 continue; | ||||
| @@ -1733,7 +1806,12 @@ struct llama_server_context | ||||
|                             slot.ga_i = ga_i; | ||||
|                         } | ||||
|  | ||||
|                         LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed); | ||||
|                         LOG_INFO("slot progression", { | ||||
|                             { "slot_id", slot.id }, | ||||
|                             { "task_id", slot.task_id }, | ||||
|                             { "n_past",  slot.n_past }, | ||||
|                             { "num_prompt_tokens_processed", slot.num_prompt_tokens_processed } | ||||
|                         }); | ||||
|                     } | ||||
|  | ||||
|                     slot.cache_tokens = prompt_tokens; | ||||
| @@ -1741,7 +1819,10 @@ struct llama_server_context | ||||
|                     if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0) | ||||
|                     { | ||||
|                         // we have to evaluate at least 1 token to generate logits. | ||||
|                         LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id); | ||||
|                         LOG_INFO("we have to evaluate at least 1 token to generate logits", { | ||||
|                             { "slot_id", slot.id }, | ||||
|                             { "task_id", slot.task_id } | ||||
|                         }); | ||||
|                         slot.n_past--; | ||||
|                         if (slot.ga_i > 0) | ||||
|                         { | ||||
| @@ -1749,9 +1830,13 @@ struct llama_server_context | ||||
|                         } | ||||
|                     } | ||||
|  | ||||
|                     LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past); | ||||
|  | ||||
|                     llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1); | ||||
|                     int p0 = (int) system_tokens.size() + slot.n_past; | ||||
|                     LOG_INFO("kv cache rm [p0, end)", { | ||||
|                         { "slot_id", slot.id }, | ||||
|                         { "task_id", slot.task_id }, | ||||
|                         { "p0",      p0 } | ||||
|                     }); | ||||
|                     llama_kv_cache_seq_rm(ctx, slot.id, p0, -1); | ||||
|  | ||||
|                     LOG_VERBOSE("prompt ingested", { | ||||
|                                                     {"n_past",  slot.n_past}, | ||||
| @@ -1786,7 +1871,13 @@ struct llama_server_context | ||||
|  | ||||
|                     if (has_images && !ingest_images(slot, n_batch)) | ||||
|                     { | ||||
|                         LOG_TEE("failed processing images\n"); | ||||
|                         LOG_ERROR("failed processing images", { | ||||
|                             "slot_id", slot.id, | ||||
|                             "task_id", slot.task_id, | ||||
|                         }); | ||||
|                         // FIXME @phymbert: to be properly tested | ||||
|                         //  early returning without changing the slot state will block the slot for ever | ||||
|                         // no one at the moment is checking the return value | ||||
|                         return false; | ||||
|                     } | ||||
|  | ||||
| @@ -1928,6 +2019,8 @@ struct llama_server_context | ||||
|                 slot.i_batch = -1; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         LOG_VERBOSE("slots updated", {}); | ||||
|         return true; | ||||
|     } | ||||
|  | ||||
| @@ -2005,6 +2098,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, | ||||
|     printf("  -ctv TYPE, --cache-type-v TYPE\n"); | ||||
|     printf("                            KV cache data type for V (default: f16)\n"); | ||||
|     printf("  --mmproj MMPROJ_FILE      path to a multimodal projector file for LLaVA.\n"); | ||||
|     printf("  --log-format              log output format: json or text (default: json)\n"); | ||||
|     printf("  --log-disable             disables logging to a file.\n"); | ||||
|     printf("  --slots-endpoint-disable  disables slots monitoring endpoint.\n"); | ||||
|     printf("  --metrics                 enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled"); | ||||
| @@ -2458,6 +2552,27 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, | ||||
|             } | ||||
|             params.mmproj = argv[i]; | ||||
|         } | ||||
|         else if (arg == "--log-format") | ||||
|         { | ||||
|             if (++i >= argc) | ||||
|             { | ||||
|                 invalid_param = true; | ||||
|                 break; | ||||
|             } | ||||
|             if (std::strcmp(argv[i], "json") == 0) | ||||
|             { | ||||
|                 server_log_json = true; | ||||
|             } | ||||
|             else if (std::strcmp(argv[i], "text") == 0) | ||||
|             { | ||||
|                 server_log_json = false; | ||||
|             } | ||||
|             else | ||||
|             { | ||||
|                 invalid_param = true; | ||||
|                 break; | ||||
|             } | ||||
|         } | ||||
|         else if (arg == "--log-disable") | ||||
|         { | ||||
|             log_set_target(stdout); | ||||
| @@ -2571,32 +2686,40 @@ static json format_partial_response( | ||||
|  | ||||
| static json format_tokenizer_response(const std::vector<llama_token> &tokens) | ||||
| { | ||||
|     return json{ | ||||
|         {"tokens", tokens}}; | ||||
|     return json { | ||||
|         {"tokens", tokens} | ||||
|     }; | ||||
| } | ||||
|  | ||||
| static json format_detokenized_response(std::string content) | ||||
| { | ||||
|     return json{ | ||||
|         {"content", content}}; | ||||
|     return json { | ||||
|         {"content", content} | ||||
|     }; | ||||
| } | ||||
|  | ||||
|  | ||||
| static void log_server_request(const httplib::Request &req, const httplib::Response &res) | ||||
| { | ||||
|     // skip GH copilot requests when using default port | ||||
|     if (req.path == "/v1/health" || req.path == "/v1/completions") | ||||
|     { | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     LOG_INFO("request", { | ||||
|                             {"remote_addr", req.remote_addr}, | ||||
|                             {"remote_port", req.remote_port}, | ||||
|                             {"status", res.status}, | ||||
|                             {"method", req.method}, | ||||
|                             {"path", req.path}, | ||||
|                             {"params", req.params}, | ||||
|                         }); | ||||
|         {"remote_addr", req.remote_addr}, | ||||
|         {"remote_port", req.remote_port}, | ||||
|         {"status",      res.status}, | ||||
|         {"method",      req.method}, | ||||
|         {"path",        req.path}, | ||||
|         {"params",      req.params}, | ||||
|     }); | ||||
|  | ||||
|     LOG_VERBOSE("request", { | ||||
|                                {"request", req.body}, | ||||
|                                {"response", res.body}, | ||||
|                            }); | ||||
|         {"request",  req.body}, | ||||
|         {"response", res.body}, | ||||
|     }); | ||||
| } | ||||
|  | ||||
| struct token_translator | ||||
| @@ -2873,9 +2996,6 @@ int main(int argc, char **argv) | ||||
|     // Set the base directory for serving static files | ||||
|     svr.set_base_dir(sparams.public_path); | ||||
|  | ||||
|     // to make it ctrl+clickable: | ||||
|     LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port); | ||||
|  | ||||
|     std::unordered_map<std::string, std::string> log_data; | ||||
|     log_data["hostname"] = sparams.hostname; | ||||
|     log_data["port"] = std::to_string(sparams.port); | ||||
|   | ||||
| @@ -32,6 +32,7 @@ It's possible to override some scenario steps values with environment variables: | ||||
|  - `PORT` -> `context.server_port` to set the listening port of the server during scenario, default: `8080` | ||||
|  - `LLAMA_SERVER_BIN_PATH` -> to change the server binary path, default: `../../../build/bin/server` | ||||
|  - `DEBUG` -> "ON" to enable steps and server verbose mode `--verbose` | ||||
|  - `SERVER_LOG_FORMAT_JSON` -> if set switch server logs to json format | ||||
|  | ||||
| ### Run @bug, @wip or @wrong_usage annotated scenario | ||||
|  | ||||
|   | ||||
| @@ -29,9 +29,9 @@ Feature: llama.cpp server | ||||
|     And   prometheus metrics are exposed | ||||
|  | ||||
|     Examples: Prompts | ||||
|       | prompt                           | n_predict | re_content                   | n_predicted | | ||||
|       | I believe the meaning of life is | 8         | read                         | 8           | | ||||
|       | Write a joke about AI            | 64        | (park<or>friends<or>scared)+ | 32          | | ||||
|       | prompt                           | n_predict | re_content                             | n_predicted | | ||||
|       | I believe the meaning of life is | 8         | (read<or>going)+                       | 8           | | ||||
|       | Write a joke about AI            | 64        | (park<or>friends<or>scared<or>always)+ | 32          | | ||||
|  | ||||
|   Scenario Outline: OAI Compatibility | ||||
|     Given a model <model> | ||||
|   | ||||
| @@ -792,6 +792,8 @@ def start_server_background(context): | ||||
|         server_args.extend(['--api-key', context.server_api_key]) | ||||
|     if context.debug: | ||||
|         server_args.append('--verbose') | ||||
|     if 'SERVER_LOG_FORMAT_JSON' not in os.environ: | ||||
|         server_args.extend(['--log-format', "text"]) | ||||
|     print(f"starting server with: {context.server_path}", *server_args) | ||||
|     context.server_process = subprocess.Popen( | ||||
|         [str(arg) for arg in [context.server_path, *server_args]], | ||||
|   | ||||
| @@ -14,6 +14,7 @@ | ||||
| using json = nlohmann::json; | ||||
|  | ||||
| extern bool server_verbose; | ||||
| extern bool server_log_json; | ||||
|  | ||||
| #ifndef SERVER_VERBOSE | ||||
| #define SERVER_VERBOSE 1 | ||||
| @@ -27,14 +28,14 @@ extern bool server_verbose; | ||||
|     {                                                                    \ | ||||
|         if (server_verbose)                                              \ | ||||
|         {                                                                \ | ||||
|             server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \ | ||||
|             server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \ | ||||
|         }                                                                \ | ||||
|     } while (0) | ||||
| #endif | ||||
|  | ||||
| #define LOG_ERROR(  MSG, ...) server_log("ERROR",   __func__, __LINE__, MSG, __VA_ARGS__) | ||||
| #define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) | ||||
| #define LOG_INFO(   MSG, ...) server_log("INFO",    __func__, __LINE__, MSG, __VA_ARGS__) | ||||
| #define LOG_ERROR(  MSG, ...) server_log("ERR",  __func__, __LINE__, MSG, __VA_ARGS__) | ||||
| #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__) | ||||
| #define LOG_INFO(   MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) | ||||
|  | ||||
| // | ||||
| // parallel | ||||
| @@ -133,26 +134,48 @@ struct completion_token_output | ||||
|     std::string text_to_send; | ||||
| }; | ||||
|  | ||||
| static inline void server_log(const char *level, const char *function, int line, | ||||
|                        const char *message, const nlohmann::ordered_json &extra) | ||||
| static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) | ||||
| { | ||||
|     nlohmann::ordered_json log | ||||
|     { | ||||
|     std::stringstream ss_tid; | ||||
|     ss_tid << std::this_thread::get_id(); | ||||
|     json log = nlohmann::ordered_json{ | ||||
|         {"tid", ss_tid.str()}, | ||||
|         {"timestamp", time(nullptr)}, | ||||
|         {"level",     level}, | ||||
|         {"function",  function}, | ||||
|         {"line",      line}, | ||||
|         {"message",   message}, | ||||
|     }; | ||||
|  | ||||
|     if (!extra.empty()) | ||||
|     { | ||||
|         log.merge_patch(extra); | ||||
|     } | ||||
|     if (server_log_json) { | ||||
|         log.merge_patch( | ||||
|                 { | ||||
|                         {"level",     level}, | ||||
|                         {"function",  function}, | ||||
|                         {"line",      line}, | ||||
|                         {"msg",       message}, | ||||
|                 }); | ||||
|         if (!extra.empty()) { | ||||
|             log.merge_patch(extra); | ||||
|         } | ||||
|  | ||||
|     const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace); | ||||
|     printf("%.*s\n", (int)str.size(), str.data()); | ||||
|     fflush(stdout); | ||||
|         std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush; | ||||
|     } else { | ||||
|         char buf[1024]; | ||||
|         snprintf(buf, 1024, "%4s [%24s] %s", level, function, message); | ||||
|  | ||||
|         if (!extra.empty()) { | ||||
|             log.merge_patch(extra); | ||||
|         } | ||||
|         std::stringstream ss; | ||||
|         ss << buf << " |"; | ||||
|         for (const auto& el : log.items()) | ||||
|         { | ||||
|             const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace); | ||||
|             snprintf(buf, 1024, " %s=%s", el.key().c_str(), value.c_str()); | ||||
|             ss << buf; | ||||
|         } | ||||
|  | ||||
|         const std::string str = ss.str(); | ||||
|         printf("%.*s\n", (int)str.size(), str.data()); | ||||
|         fflush(stdout); | ||||
|     } | ||||
| } | ||||
|  | ||||
| // | ||||
| @@ -234,6 +257,7 @@ struct llama_server_queue { | ||||
|         std::unique_lock<std::mutex> lock(mutex_tasks); | ||||
|         if (task.id == -1) { | ||||
|             task.id = id++; | ||||
|             LOG_VERBOSE("new task id", {{"new_id", task.id}}); | ||||
|         } | ||||
|         queue_tasks.push_back(std::move(task)); | ||||
|         condition_tasks.notify_one(); | ||||
| @@ -249,7 +273,9 @@ struct llama_server_queue { | ||||
|     // Get the next id for creating anew task | ||||
|     int get_new_id() { | ||||
|         std::unique_lock<std::mutex> lock(mutex_tasks); | ||||
|         return id++; | ||||
|         int new_id = id++; | ||||
|         LOG_VERBOSE("new task id", {{"new_id", new_id}}); | ||||
|         return new_id; | ||||
|     } | ||||
|  | ||||
|     // Register function to process a new task | ||||
| @@ -290,8 +316,7 @@ struct llama_server_queue { | ||||
|     void start_loop() { | ||||
|         running = true; | ||||
|         while (true) { | ||||
|             // new task arrived | ||||
|             LOG_VERBOSE("have new task", {}); | ||||
|             LOG_VERBOSE("new task may arrive", {}); | ||||
|             { | ||||
|                 while (true) | ||||
|                 { | ||||
| @@ -303,7 +328,7 @@ struct llama_server_queue { | ||||
|                     task_server task = queue_tasks.front(); | ||||
|                     queue_tasks.erase(queue_tasks.begin()); | ||||
|                     lock.unlock(); | ||||
|                     LOG_VERBOSE("callback_new_task", {}); | ||||
|                     LOG_VERBOSE("callback_new_task", {{"task_id", task.id}}); | ||||
|                     callback_new_task(task); | ||||
|                 } | ||||
|                 LOG_VERBOSE("callback_all_task_finished", {}); | ||||
| @@ -384,11 +409,13 @@ struct llama_server_response { | ||||
|     std::condition_variable condition_results; | ||||
|  | ||||
|     void add_waiting_task_id(int task_id) { | ||||
|         LOG_VERBOSE("waiting for task id", {{"task_id", task_id}}); | ||||
|         std::unique_lock<std::mutex> lock(mutex_results); | ||||
|         waiting_task_ids.insert(task_id); | ||||
|     } | ||||
|  | ||||
|     void remove_waiting_task_id(int task_id) { | ||||
|         LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}}); | ||||
|         std::unique_lock<std::mutex> lock(mutex_results); | ||||
|         waiting_task_ids.erase(task_id); | ||||
|     } | ||||
| @@ -401,7 +428,6 @@ struct llama_server_response { | ||||
|             condition_results.wait(lock, [&]{ | ||||
|                 return !queue_results.empty(); | ||||
|             }); | ||||
|             LOG_VERBOSE("condition_results unblock", {}); | ||||
|  | ||||
|             for (int i = 0; i < (int) queue_results.size(); i++) | ||||
|             { | ||||
| @@ -426,20 +452,20 @@ struct llama_server_response { | ||||
|     // Send a new result to a waiting task_id | ||||
|     void send(task_result result) { | ||||
|         std::unique_lock<std::mutex> lock(mutex_results); | ||||
|         LOG_VERBOSE("send new result", {}); | ||||
|         LOG_VERBOSE("send new result", {{"task_id", result.id}}); | ||||
|         for (auto& task_id : waiting_task_ids) { | ||||
|             // LOG_TEE("waiting task id %i \n", task_id); | ||||
|             // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result | ||||
|             if (result.multitask_id == task_id) | ||||
|             { | ||||
|                 LOG_VERBOSE("callback_update_multitask", {}); | ||||
|                 LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}}); | ||||
|                 callback_update_multitask(task_id, result.id, result); | ||||
|                 continue; | ||||
|             } | ||||
|  | ||||
|             if (result.id == task_id) | ||||
|             { | ||||
|                 LOG_VERBOSE("queue_results.push_back", {}); | ||||
|                 LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}}); | ||||
|                 queue_results.push_back(result); | ||||
|                 condition_results.notify_all(); | ||||
|                 return; | ||||
|   | ||||