mirror of https://github.com/ggml-org/llama.cpp.git

Merge remote-tracking branch 'origin/master' into jinja
		| @@ -67,6 +67,13 @@ enum server_task_type { | ||||
|     SERVER_TASK_TYPE_SET_LORA, | ||||
| }; | ||||
|  | ||||
| enum oaicompat_type { | ||||
|     OAICOMPAT_TYPE_NONE, | ||||
|     OAICOMPAT_TYPE_CHAT, | ||||
|     OAICOMPAT_TYPE_COMPLETION, | ||||
|     OAICOMPAT_TYPE_EMBEDDING, | ||||
| }; | ||||
|  | ||||
| // https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 | ||||
| enum error_type { | ||||
|     ERROR_TYPE_INVALID_REQUEST, | ||||
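The new `oaicompat_type` enum replaces the pair of booleans (`oaicompat`, `oaicompat_chat`) that the later hunks remove from `slot_params` and the result structs, and adds a dedicated value for embeddings. Purely as an illustration of how the old flag pair maps onto the enum (this helper is not part of the patch):

    // Illustrative only: correspondence between the removed flag pair and the
    // new enum values used throughout this patch (enum declared above).
    static oaicompat_type oaicompat_from_legacy_flags(bool oaicompat, bool oaicompat_chat) {
        if (!oaicompat) {
            return OAICOMPAT_TYPE_NONE;
        }
        return oaicompat_chat ? OAICOMPAT_TYPE_CHAT : OAICOMPAT_TYPE_COMPLETION;
    }

Embeddings previously carried their own `bool oaicompat`; they now use `OAICOMPAT_TYPE_EMBEDDING`, and the result `to_json()` methods dispatch on the enum directly, as the switch statements further down show.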
| @@ -91,6 +98,8 @@ struct slot_params { | ||||
|     int64_t t_max_prompt_ms  = -1; // TODO: implement | ||||
|     int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit | ||||
|  | ||||
|     std::vector<common_adapter_lora_info> lora; | ||||
|  | ||||
|     std::vector<std::string> antiprompt; | ||||
|     std::vector<std::string> response_fields; | ||||
|     bool timings_per_token = false; | ||||
| @@ -101,11 +110,10 @@ struct slot_params { | ||||
|     struct common_params_speculative speculative; | ||||
|  | ||||
|     // OAI-compat fields | ||||
|     bool        verbose        = false; | ||||
|     bool        oaicompat      = false; | ||||
|     bool        oaicompat_chat = true; | ||||
|     std::string oaicompat_model; | ||||
|     std::string oaicompat_cmpl_id; | ||||
|     bool           verbose        = false; | ||||
|     oaicompat_type oaicompat      = OAICOMPAT_TYPE_NONE; | ||||
|     std::string    oaicompat_model; | ||||
|     std::string    oaicompat_cmpl_id; | ||||
|  | ||||
|     json to_json() const { | ||||
|         std::vector<std::string> samplers; | ||||
| @@ -114,6 +122,11 @@ struct slot_params { | ||||
|             samplers.emplace_back(common_sampler_type_to_str(sampler)); | ||||
|         } | ||||
|  | ||||
|         json lora = json::array(); | ||||
|         for (size_t i = 0; i < this->lora.size(); ++i) { | ||||
|             lora.push_back({{"id", i}, {"scale", this->lora[i].scale}}); | ||||
|         } | ||||
|  | ||||
|         return json { | ||||
|             {"n_predict",                 n_predict},     // Server configured n_predict | ||||
|             {"seed",                      sampling.seed}, | ||||
| @@ -154,6 +167,7 @@ struct slot_params { | ||||
|             {"speculative.p_min",         speculative.p_min}, | ||||
|             {"timings_per_token",         timings_per_token}, | ||||
|             {"post_sampling_probs",       post_sampling_probs}, | ||||
|             {"lora",                      lora}, | ||||
|         }; | ||||
|     } | ||||
| }; | ||||
| @@ -183,13 +197,18 @@ struct server_task { | ||||
|     // used by SERVER_TASK_TYPE_METRICS | ||||
|     bool metrics_reset_bucket = false; | ||||
|  | ||||
|     // used by SERVER_TASK_TYPE_SET_LORA | ||||
|     std::vector<common_adapter_lora_info> set_lora; | ||||
|  | ||||
|     server_task(server_task_type type) : type(type) {} | ||||
|  | ||||
|     static slot_params params_from_json_cmpl( | ||||
|             const llama_model * model, | ||||
|             const llama_context * ctx, | ||||
|             const common_params & params_base, | ||||
|             const json & data) { | ||||
|         const llama_model * model = llama_get_model(ctx); | ||||
|         const llama_vocab * vocab = llama_model_get_vocab(model); | ||||
|  | ||||
|         slot_params params; | ||||
|  | ||||
|         // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them) | ||||
| @@ -245,6 +264,16 @@ struct server_task { | ||||
|         params.speculative.n_min = std::max(params.speculative.n_min, 2); | ||||
|         params.speculative.n_max = std::max(params.speculative.n_max, 0); | ||||
|  | ||||
|         if (data.contains("lora")) { | ||||
|             if (data.at("lora").is_array()) { | ||||
|                 params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora")); | ||||
|             } else { | ||||
|                 throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields"); | ||||
|             } | ||||
|         } else { | ||||
|             params.lora = params_base.lora_adapters; | ||||
|         } | ||||
|  | ||||
|         // TODO: add more sanity checks for the input parameters | ||||
|  | ||||
|         if (params.sampling.penalty_last_n < -1) { | ||||
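With this hunk, LoRA adapters can be scoped per request: a `"lora"` array of `{"id", "scale"}` objects overrides the server-wide adapter set for that request, while omitting the field falls back to `params_base.lora_adapters`. A sketch of a request body using the new field, built with the same nlohmann::json the server uses; adapter ids follow the order of the adapters loaded at startup, and adapters not listed are presumably left disabled (scale 0), mirroring the behaviour of the old `/lora-adapters` handler removed further down:

    #include <iostream>
    #include <nlohmann/json.hpp>

    int main() {
        nlohmann::json lora = nlohmann::json::array();
        lora.push_back({{"id", 0}, {"scale", 0.5}}); // enable adapter 0 at half strength

        nlohmann::json body = {
            {"prompt",    "Write a haiku about llamas."},
            {"n_predict", 64},
            {"lora",      lora},
        };
        std::cout << body.dump(2) << std::endl; // POST this to /completion
        return 0;
    }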
| @@ -302,7 +331,7 @@ struct server_task { | ||||
|  | ||||
|             const auto & logit_bias = data.find("logit_bias"); | ||||
|             if (logit_bias != data.end() && logit_bias->is_array()) { | ||||
|                 const int n_vocab = llama_n_vocab(model); | ||||
|                 const int n_vocab = llama_vocab_n_tokens(vocab); | ||||
|                 for (const auto & el : *logit_bias) { | ||||
|                     // TODO: we may want to throw errors here, in case "el" is incorrect | ||||
|                     if (el.is_array() && el.size() == 2) { | ||||
| @@ -321,7 +350,7 @@ struct server_task { | ||||
|                                 params.sampling.logit_bias.push_back({tok, bias}); | ||||
|                             } | ||||
|                         } else if (el[0].is_string()) { | ||||
|                             auto toks = common_tokenize(model, el[0].get<std::string>(), false); | ||||
|                             auto toks = common_tokenize(vocab, el[0].get<std::string>(), false); | ||||
|                             for (auto tok : toks) { | ||||
|                                 params.sampling.logit_bias.push_back({tok, bias}); | ||||
|                             } | ||||
| @@ -529,11 +558,10 @@ struct server_task_result_cmpl_final : server_task_result { | ||||
|     slot_params generation_params; | ||||
|  | ||||
|     // OAI-compat fields | ||||
|     bool        verbose        = false; | ||||
|     bool        oaicompat      = false; | ||||
|     bool        oaicompat_chat = true; // TODO: support oaicompat for non-chat | ||||
|     std::string oaicompat_model; | ||||
|     std::string oaicompat_cmpl_id; | ||||
|     bool           verbose        = false; | ||||
|     oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE; | ||||
|     std::string    oaicompat_model; | ||||
|     std::string    oaicompat_cmpl_id; | ||||
|  | ||||
|     virtual int get_index() override { | ||||
|         return index; | ||||
| @@ -544,9 +572,16 @@ struct server_task_result_cmpl_final : server_task_result { | ||||
|     } | ||||
|  | ||||
|     virtual json to_json() override { | ||||
|         return oaicompat | ||||
|             ? (stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat()) | ||||
|             : to_json_non_oaicompat(); | ||||
|         switch (oaicompat) { | ||||
|             case OAICOMPAT_TYPE_NONE: | ||||
|                 return to_json_non_oaicompat(); | ||||
|             case OAICOMPAT_TYPE_COMPLETION: | ||||
|                 return to_json_oaicompat(); | ||||
|             case OAICOMPAT_TYPE_CHAT: | ||||
|                 return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat(); | ||||
|             default: | ||||
|                 GGML_ASSERT(false && "Invalid oaicompat_type"); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     json to_json_non_oaicompat() { | ||||
| @@ -574,6 +609,50 @@ struct server_task_result_cmpl_final : server_task_result { | ||||
|         return response_fields.empty() ? res : json_get_nested_values(response_fields, res); | ||||
|     } | ||||
|  | ||||
|     json to_json_oaicompat() { | ||||
|         std::time_t t = std::time(0); | ||||
|         json logprobs = json(nullptr); // OAI default to null | ||||
|         if (!stream && probs_output.size() > 0) { | ||||
|             logprobs = json{ | ||||
|                 {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)}, | ||||
|             }; | ||||
|         } | ||||
|         json finish_reason = "length"; | ||||
|         if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { | ||||
|             finish_reason = "stop"; | ||||
|         } | ||||
|         json res = json { | ||||
|             {"choices",            json::array({ | ||||
|                 json{ | ||||
|                     {"text",          stream ? "" : content}, // in stream mode, content is already in last partial chunk | ||||
|                     {"index",         index}, | ||||
|                     {"logprobs",      logprobs}, | ||||
|                     {"finish_reason", finish_reason}, | ||||
|                 } | ||||
|             })}, | ||||
|             {"created",            t}, | ||||
|             {"model",              oaicompat_model}, | ||||
|             {"system_fingerprint", build_info}, | ||||
|             {"object",             "text_completion"}, | ||||
|             {"usage", json { | ||||
|                 {"completion_tokens", n_decoded}, | ||||
|                 {"prompt_tokens",     n_prompt_tokens}, | ||||
|                 {"total_tokens",      n_decoded + n_prompt_tokens} | ||||
|             }}, | ||||
|             {"id", oaicompat_cmpl_id} | ||||
|         }; | ||||
|  | ||||
|         // extra fields for debugging purposes | ||||
|         if (verbose) { | ||||
|             res["__verbose"] = to_json_non_oaicompat(); | ||||
|         } | ||||
|         if (timings.prompt_n >= 0) { | ||||
|             res.push_back({"timings", timings.to_json()}); | ||||
|         } | ||||
|  | ||||
|         return res; | ||||
|     } | ||||
|  | ||||
|     json to_json_oaicompat_chat() { | ||||
|         std::string finish_reason = "length"; | ||||
|         if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { | ||||
| @@ -671,11 +750,10 @@ struct server_task_result_cmpl_partial : server_task_result { | ||||
|     result_timings timings; | ||||
|  | ||||
|     // OAI-compat fields | ||||
|     bool        verbose        = false; | ||||
|     bool        oaicompat      = false; | ||||
|     bool        oaicompat_chat = true; // TODO: support oaicompat for non-chat | ||||
|     std::string oaicompat_model; | ||||
|     std::string oaicompat_cmpl_id; | ||||
|     bool           verbose   = false; | ||||
|     oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE; | ||||
|     std::string    oaicompat_model; | ||||
|     std::string    oaicompat_cmpl_id; | ||||
|  | ||||
|     virtual int get_index() override { | ||||
|         return index; | ||||
| @@ -686,7 +764,16 @@ struct server_task_result_cmpl_partial : server_task_result { | ||||
|     } | ||||
|  | ||||
|     virtual json to_json() override { | ||||
|         return oaicompat ? to_json_oaicompat() : to_json_non_oaicompat(); | ||||
|         switch (oaicompat) { | ||||
|             case OAICOMPAT_TYPE_NONE: | ||||
|                 return to_json_non_oaicompat(); | ||||
|             case OAICOMPAT_TYPE_COMPLETION: | ||||
|                 return to_json_oaicompat(); | ||||
|             case OAICOMPAT_TYPE_CHAT: | ||||
|                 return to_json_oaicompat_chat(); | ||||
|             default: | ||||
|                 GGML_ASSERT(false && "Invalid oaicompat_type"); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     json to_json_non_oaicompat() { | ||||
| @@ -711,6 +798,41 @@ struct server_task_result_cmpl_partial : server_task_result { | ||||
|     } | ||||
|  | ||||
|     json to_json_oaicompat() { | ||||
|         std::time_t t = std::time(0); | ||||
|         json logprobs = json(nullptr); // OAI default to null | ||||
|         if (prob_output.probs.size() > 0) { | ||||
|             logprobs = json{ | ||||
|                 {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)}, | ||||
|             }; | ||||
|         } | ||||
|         json res = json { | ||||
|             {"choices",            json::array({ | ||||
|                 json{ | ||||
|                     {"text",          content}, | ||||
|                     {"index",         index}, | ||||
|                     {"logprobs",      logprobs}, | ||||
|                     {"finish_reason", nullptr}, | ||||
|                 } | ||||
|             })}, | ||||
|             {"created",            t}, | ||||
|             {"model",              oaicompat_model}, | ||||
|             {"system_fingerprint", build_info}, | ||||
|             {"object",             "text_completion"}, | ||||
|             {"id",                 oaicompat_cmpl_id} | ||||
|         }; | ||||
|  | ||||
|         // extra fields for debugging purposes | ||||
|         if (verbose) { | ||||
|             res["__verbose"] = to_json_non_oaicompat(); | ||||
|         } | ||||
|         if (timings.prompt_n >= 0) { | ||||
|             res.push_back({"timings", timings.to_json()}); | ||||
|         } | ||||
|  | ||||
|         return res; | ||||
|     } | ||||
|  | ||||
|     json to_json_oaicompat_chat() { | ||||
|         bool first = n_decoded == 0; | ||||
|         std::time_t t = std::time(0); | ||||
|         json choices; | ||||
| @@ -789,14 +911,16 @@ struct server_task_result_embd : server_task_result { | ||||
|     int32_t n_tokens; | ||||
|  | ||||
|     // OAI-compat fields | ||||
|     bool oaicompat = false; | ||||
|     oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE; | ||||
|  | ||||
|     virtual int get_index() override { | ||||
|         return index; | ||||
|     } | ||||
|  | ||||
|     virtual json to_json() override { | ||||
|         return oaicompat ? to_json_oaicompat() : to_json_non_oaicompat(); | ||||
|         return oaicompat == OAICOMPAT_TYPE_EMBEDDING | ||||
|             ? to_json_oaicompat() | ||||
|             : to_json_non_oaicompat(); | ||||
|     } | ||||
|  | ||||
|     json to_json_non_oaicompat() { | ||||
| @@ -1009,6 +1133,8 @@ struct server_slot { | ||||
|  | ||||
|     common_speculative * spec = nullptr; | ||||
|  | ||||
|     std::vector<common_adapter_lora_info> lora; | ||||
|  | ||||
|     // the index relative to completion multi-task request | ||||
|     size_t index = 0; | ||||
|  | ||||
| @@ -1090,6 +1216,11 @@ struct server_slot { | ||||
|         return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK; | ||||
|     } | ||||
|  | ||||
|     bool can_batch_with(server_slot & other_slot) { | ||||
|         return is_non_causal() == other_slot.is_non_causal() | ||||
|             && are_lora_equal(lora, other_slot.lora); | ||||
|     } | ||||
|  | ||||
|     bool has_budget(const common_params & global_params) { | ||||
|         if (params.n_predict == -1 && global_params.n_predict == -1) { | ||||
|             return true; // limitless | ||||
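The new `can_batch_with` gate only lets two slots share a decode batch when they agree on causal vs. non-causal mode and on the active LoRA set. `are_lora_equal` lives in the server utils rather than in this file; a minimal sketch of the comparison it presumably performs (the real helper may compare the underlying adapter handles rather than paths), assuming the llama.cpp common headers:

    #include <vector>
    #include "common.h" // for common_adapter_lora_info (path, scale, ...)

    // Sketch (assumption): two adapter lists are equal iff they apply the same
    // adapters with the same strengths, in the same order.
    static bool are_lora_equal(
            const std::vector<common_adapter_lora_info> & a,
            const std::vector<common_adapter_lora_info> & b) {
        if (a.size() != b.size()) {
            return false;
        }
        for (size_t i = 0; i < a.size(); ++i) {
            if (a[i].path != b[i].path || a[i].scale != b[i].scale) {
                return false;
            }
        }
        return true;
    }

This is also why `launch_slot_with_task` further down clears `cache_tokens` when the LoRA set changes: the cached KV entries were computed with a different set of adapters applied.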
| @@ -1497,11 +1628,17 @@ struct server_response { | ||||
| struct server_context { | ||||
|     common_params params_base; | ||||
|  | ||||
|     // note: keep these alive - they determine the lifetime of the model, context, etc. | ||||
|     common_init_result llama_init; | ||||
|     common_init_result llama_init_dft; | ||||
|  | ||||
|     llama_model * model = nullptr; | ||||
|     llama_context * ctx = nullptr; | ||||
|     std::vector<common_lora_adapter_container> loras; | ||||
|  | ||||
|     const llama_vocab * vocab = nullptr; | ||||
|  | ||||
|     llama_model * model_dft = nullptr; | ||||
|  | ||||
|     llama_context_params cparams_dft; | ||||
|  | ||||
|     llama_batch batch = {}; | ||||
| @@ -1525,21 +1662,6 @@ struct server_context { | ||||
|     float slot_prompt_similarity = 0.0f; | ||||
|  | ||||
|     ~server_context() { | ||||
|         if (ctx) { | ||||
|             llama_free(ctx); | ||||
|             ctx = nullptr; | ||||
|         } | ||||
|  | ||||
|         if (model) { | ||||
|             llama_free_model(model); | ||||
|             model = nullptr; | ||||
|         } | ||||
|  | ||||
|         if (model_dft) { | ||||
|             llama_free_model(model_dft); | ||||
|             model_dft = nullptr; | ||||
|         } | ||||
|  | ||||
|         // Clear any sampling context | ||||
|         for (server_slot & slot : slots) { | ||||
|             common_sampler_free(slot.smpl); | ||||
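The destructor no longer calls `llama_free` / `llama_free_model` by hand because `server_context` now stores the `common_init_result` objects themselves (the "keep these alive" note above) and only borrows raw pointers via `.get()`. Ownership is assumed to sit in smart pointers inside `common_init_result`; a minimal sketch of that RAII pattern, under the assumption that the handles are wrapped in `std::unique_ptr` with deleters that forward to the C API:

    #include <memory>
    #include "llama.h"

    // Deleters forwarding to the C API, so each handle is freed exactly once.
    struct model_deleter   { void operator()(llama_model * m)   const { llama_model_free(m); } }; // renamed from llama_free_model
    struct context_deleter { void operator()(llama_context * c) const { llama_free(c); } };

    // Rough shape of what common_init_result is assumed to hold after this change.
    struct init_result_sketch {
        std::unique_ptr<llama_model,   model_deleter>   model;
        std::unique_ptr<llama_context, context_deleter> context;
    };

The same reasoning explains the draft-model hunk below, where the explicit `llama_free(llama_init_dft.context)` calls disappear: the temporary draft context is released when `llama_init_dft` is overwritten or destroyed.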
| @@ -1562,21 +1684,22 @@ struct server_context { | ||||
|  | ||||
|         params_base = params; | ||||
|  | ||||
|         common_init_result llama_init = common_init_from_params(params_base); | ||||
|         llama_init = common_init_from_params(params_base); | ||||
|  | ||||
|         model = llama_init.model; | ||||
|         ctx   = llama_init.context; | ||||
|         loras = llama_init.lora_adapters; | ||||
|         model = llama_init.model.get(); | ||||
|         ctx   = llama_init.context.get(); | ||||
|  | ||||
|         if (model == nullptr) { | ||||
|             SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str()); | ||||
|             return false; | ||||
|         } | ||||
|  | ||||
|         vocab = llama_model_get_vocab(model); | ||||
|  | ||||
|         n_ctx = llama_n_ctx(ctx); | ||||
|  | ||||
|         add_bos_token = llama_add_bos_token(model); | ||||
|         has_eos_token = llama_token_eos(model) != LLAMA_TOKEN_NULL; | ||||
|         add_bos_token = llama_vocab_get_add_bos(vocab); | ||||
|         has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL; | ||||
|  | ||||
|         if (!params_base.speculative.model.empty()) { | ||||
|             SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str()); | ||||
| @@ -1589,25 +1712,22 @@ struct server_context { | ||||
|             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; | ||||
|             params_dft.n_parallel   = 1; | ||||
|  | ||||
|             common_init_result llama_init_dft = common_init_from_params(params_dft); | ||||
|             llama_init_dft = common_init_from_params(params_dft); | ||||
|  | ||||
|             model_dft = llama_init_dft.model; | ||||
|             model_dft = llama_init_dft.model.get(); | ||||
|  | ||||
|             if (model_dft == nullptr) { | ||||
|                 SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str()); | ||||
|                 return false; | ||||
|             } | ||||
|  | ||||
|             if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) { | ||||
|             if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) { | ||||
|                 SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str()); | ||||
|  | ||||
|                 llama_free      (llama_init_dft.context); | ||||
|                 llama_free_model(llama_init_dft.model); | ||||
|  | ||||
|                 return false; | ||||
|             } | ||||
|  | ||||
|             const int n_ctx_dft = llama_n_ctx(llama_init_dft.context); | ||||
|             const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get()); | ||||
|  | ||||
|             cparams_dft = common_context_params_to_llama(params_dft); | ||||
|             cparams_dft.n_batch = n_ctx_dft; | ||||
| @@ -1615,15 +1735,12 @@ struct server_context { | ||||
|             // force F16 KV cache for the draft model for extra performance | ||||
|             cparams_dft.type_k = GGML_TYPE_F16; | ||||
|             cparams_dft.type_v = GGML_TYPE_F16; | ||||
|  | ||||
|             // the context is not needed - we will create one for each slot | ||||
|             llama_free(llama_init_dft.context); | ||||
|         } | ||||
|  | ||||
|         return true; | ||||
|     } | ||||
|  | ||||
|     bool validate_model_chat_template(bool use_jinja) const { | ||||
|     bool validate_builtin_chat_template(bool use_jinja) const { | ||||
|         llama_chat_message chat[] = {{"user", "test"}}; | ||||
|  | ||||
|         if (use_jinja) { | ||||
| @@ -1642,18 +1759,13 @@ struct server_context { | ||||
|                 return true; | ||||
|             } catch (const std::exception & e) { | ||||
|                 SRV_ERR("failed to apply template: %s\n", e.what()); | ||||
|                 return false; | ||||
|             } | ||||
|         } else { | ||||
|             std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes | ||||
|             std::string template_key = "tokenizer.chat_template"; | ||||
|             int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); | ||||
|             if (res >= 0) { | ||||
|                 std::string tmpl = std::string(model_template.data(), model_template.size()); | ||||
|                 int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0); | ||||
|                 return chat_res > 0; | ||||
|             } | ||||
|             const char * tmpl = llama_model_chat_template(model, /* name */ nullptr); | ||||
|             const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0); | ||||
|             return chat_res > 0; | ||||
|         } | ||||
|         return false; | ||||
|     } | ||||
|  | ||||
|     void init() { | ||||
| @@ -1672,7 +1784,7 @@ struct server_context { | ||||
|             if (model_dft) { | ||||
|                 slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1); | ||||
|  | ||||
|                 slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft); | ||||
|                 slot.ctx_dft = llama_init_from_model(model_dft, cparams_dft); | ||||
|                 if (slot.ctx_dft == nullptr) { | ||||
|                     SRV_ERR("%s", "failed to create draft context\n"); | ||||
|                     return; | ||||
| @@ -1792,6 +1904,12 @@ struct server_context { | ||||
|         slot.params        = std::move(task.params); | ||||
|         slot.prompt_tokens = std::move(task.prompt_tokens); | ||||
|  | ||||
|         if (!are_lora_equal(task.params.lora, slot.lora)) { | ||||
|             // if lora is changed, we cannot reuse cached tokens | ||||
|             slot.cache_tokens.clear(); | ||||
|             slot.lora = task.params.lora; | ||||
|         } | ||||
|  | ||||
|         SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str()); | ||||
|  | ||||
|         if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { | ||||
| @@ -1801,7 +1919,7 @@ struct server_context { | ||||
|         } | ||||
|  | ||||
|         if (slot.params.ignore_eos && has_eos_token) { | ||||
|             slot.params.sampling.logit_bias.push_back({llama_token_eos(model), -INFINITY}); | ||||
|             slot.params.sampling.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY}); | ||||
|         } | ||||
|  | ||||
|         { | ||||
| @@ -1876,6 +1994,8 @@ struct server_context { | ||||
|                 result.text_to_send = slot.generated_text.substr(pos, std::string::npos); | ||||
|                 slot.n_sent_text += result.text_to_send.size(); | ||||
|                 // add the token to slot queue and cache | ||||
|             } else { | ||||
|                 result.text_to_send = ""; | ||||
|             } | ||||
|  | ||||
|             slot.add_token(result); | ||||
| @@ -1955,14 +2075,14 @@ struct server_context { | ||||
|                     slot.n_decoded, slot.n_prompt_tokens, slot.n_past, slot.n_ctx); | ||||
|         } | ||||
|  | ||||
|         if (llama_token_is_eog(model, result.tok)) { | ||||
|         if (llama_vocab_is_eog(vocab, result.tok)) { | ||||
|             slot.stop           = STOP_TYPE_EOS; | ||||
|             slot.has_next_token = false; | ||||
|  | ||||
|             SLT_DBG(slot, "%s", "stopped by EOS\n"); | ||||
|         } | ||||
|  | ||||
|         const auto n_ctx_train = llama_n_ctx_train(model); | ||||
|         const auto n_ctx_train = llama_model_n_ctx_train(model); | ||||
|  | ||||
|         if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { | ||||
|             slot.truncated      = true; | ||||
| @@ -1982,7 +2102,7 @@ struct server_context { | ||||
|  | ||||
|     void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) { | ||||
|         size_t n_probs = slot.params.sampling.n_probs; | ||||
|         size_t n_vocab = llama_n_vocab(llama_get_model(ctx)); | ||||
|         size_t n_vocab = llama_vocab_n_tokens(vocab); | ||||
|         if (post_sampling) { | ||||
|             const auto * cur_p = common_sampler_get_candidates(slot.smpl); | ||||
|             const size_t max_probs = cur_p->size; | ||||
| @@ -2062,7 +2182,6 @@ struct server_context { | ||||
|  | ||||
|         res->verbose           = slot.params.verbose; | ||||
|         res->oaicompat         = slot.params.oaicompat; | ||||
|         res->oaicompat_chat    = slot.params.oaicompat_chat; | ||||
|         res->oaicompat_model   = slot.params.oaicompat_model; | ||||
|         res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; | ||||
|  | ||||
| @@ -2103,7 +2222,6 @@ struct server_context { | ||||
|         res->verbose           = slot.params.verbose; | ||||
|         res->stream            = slot.params.stream; | ||||
|         res->oaicompat         = slot.params.oaicompat; | ||||
|         res->oaicompat_chat    = slot.params.oaicompat_chat; | ||||
|         res->oaicompat_model   = slot.params.oaicompat_model; | ||||
|         res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; | ||||
|  | ||||
| @@ -2135,7 +2253,7 @@ struct server_context { | ||||
|         res->n_tokens  = slot.n_prompt_tokens; | ||||
|         res->oaicompat = slot.params.oaicompat; | ||||
|  | ||||
|         const int n_embd = llama_n_embd(model); | ||||
|         const int n_embd = llama_model_n_embd(model); | ||||
|  | ||||
|         std::vector<float> embd_res(n_embd, 0.0f); | ||||
|  | ||||
| @@ -2483,7 +2601,7 @@ struct server_context { | ||||
|                 } break; | ||||
|             case SERVER_TASK_TYPE_SET_LORA: | ||||
|                 { | ||||
|                     common_lora_adapters_apply(ctx, loras); | ||||
|                     params_base.lora_adapters = std::move(task.set_lora); | ||||
|                     auto res = std::make_unique<server_task_result_apply_lora>(); | ||||
|                     res->id = task.id; | ||||
|                     queue_results.send(std::move(res)); | ||||
| @@ -2560,12 +2678,22 @@ struct server_context { | ||||
|         // start populating the batch for this iteration | ||||
|         common_batch_clear(batch); | ||||
|  | ||||
|         // track if given slot can be batched with slots already in the batch | ||||
|         server_slot * slot_batched = nullptr; | ||||
|  | ||||
|         // first, add sampled tokens from any ongoing sequences | ||||
|         for (auto & slot : slots) { | ||||
|             if (slot.state != SLOT_STATE_GENERATING) { | ||||
|                 continue; | ||||
|             } | ||||
|  | ||||
|             // check if we can batch this slot with the previous one | ||||
|             if (!slot_batched) { | ||||
|                 slot_batched = &slot; | ||||
|             } else if (!slot_batched->can_batch_with(slot)) { | ||||
|                 continue; | ||||
|             } | ||||
|  | ||||
|             slot.i_batch = batch.n_tokens; | ||||
|  | ||||
|             common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true); | ||||
| @@ -2584,15 +2712,18 @@ struct server_context { | ||||
|         int32_t n_batch  = llama_n_batch(ctx); | ||||
|         int32_t n_ubatch = llama_n_ubatch(ctx); | ||||
|  | ||||
|         // track if this is an embedding or non-embedding batch | ||||
|         // if we've added sampled tokens above, we are in non-embedding mode | ||||
|         // -1: none, 0: non-embedding, 1: embedding | ||||
|         // TODO: make enum | ||||
|         int32_t batch_type = batch.n_tokens > 0 ? 0 : -1; | ||||
|  | ||||
|         // next, batch any pending prompts without exceeding n_batch | ||||
|         if (params_base.cont_batching || batch.n_tokens == 0) { | ||||
|             for (auto & slot : slots) { | ||||
|                 // check if we can batch this slot with the previous one | ||||
|                 if (slot.is_processing()) { | ||||
|                     if (!slot_batched) { | ||||
|                         slot_batched = &slot; | ||||
|                     } else if (!slot_batched->can_batch_with(slot)) { | ||||
|                         continue; | ||||
|                     } | ||||
|                 } | ||||
|  | ||||
|                 // this slot still has a prompt to be processed | ||||
|                 if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) { | ||||
|                     auto & prompt_tokens = slot.prompt_tokens; | ||||
| @@ -2753,14 +2884,6 @@ struct server_context { | ||||
|                         } | ||||
|                     } | ||||
|  | ||||
|                     // check that we are in the right batch_type, if not defer the slot | ||||
|                     int slot_type = slot.is_non_causal(); | ||||
|                     if (batch_type == -1) { | ||||
|                         batch_type = slot_type; | ||||
|                     } else if (batch_type != slot_type) { | ||||
|                         continue; | ||||
|                     } | ||||
|  | ||||
|                     // keep only the common part | ||||
|                     if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) { | ||||
|                         // could not partially delete (likely using a non-Transformer model) | ||||
| @@ -2828,8 +2951,12 @@ struct server_context { | ||||
|  | ||||
|         SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens); | ||||
|  | ||||
|         // make sure we're in the right embedding mode | ||||
|         llama_set_embeddings(ctx, batch_type == 1); | ||||
|         if (slot_batched) { | ||||
|             // make sure we're in the right embedding mode | ||||
|             llama_set_embeddings(ctx, slot_batched->is_non_causal()); | ||||
|             // apply lora, only need to do it once per batch | ||||
|             common_set_adapter_lora(ctx, slot_batched->lora); | ||||
|         } | ||||
|  | ||||
|         // process the created batch of tokens | ||||
|         for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { | ||||
| @@ -3030,12 +3157,12 @@ struct server_context { | ||||
|  | ||||
|     json model_meta() const { | ||||
|         return json { | ||||
|             {"vocab_type",  llama_vocab_type    (model)}, | ||||
|             {"n_vocab",     llama_n_vocab       (model)}, | ||||
|             {"n_ctx_train", llama_n_ctx_train   (model)}, | ||||
|             {"n_embd",      llama_n_embd        (model)}, | ||||
|             {"n_params",    llama_model_n_params(model)}, | ||||
|             {"size",        llama_model_size    (model)}, | ||||
|             {"vocab_type",  llama_vocab_type       (vocab)}, | ||||
|             {"n_vocab",     llama_vocab_n_tokens   (vocab)}, | ||||
|             {"n_ctx_train", llama_model_n_ctx_train(model)}, | ||||
|             {"n_embd",      llama_model_n_embd     (model)}, | ||||
|             {"n_params",    llama_model_n_params   (model)}, | ||||
|             {"size",        llama_model_size       (model)}, | ||||
|         }; | ||||
|     } | ||||
| }; | ||||
| @@ -3539,12 +3666,11 @@ int main(int argc, char ** argv) { | ||||
|  | ||||
|     // handle completion-like requests (completion, chat, infill) | ||||
|     // we can optionally provide a custom format for partial results and final results | ||||
|     const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok]( | ||||
|     const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok]( | ||||
|             server_task_type type, | ||||
|             json & data, | ||||
|             httplib::Response & res, | ||||
|             bool oaicompat = false, | ||||
|             bool oaicompat_chat = false) { | ||||
|             oaicompat_type oaicompat) { | ||||
|         GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL); | ||||
|  | ||||
|         if (ctx_server.params_base.embedding) { | ||||
| @@ -3556,7 +3682,7 @@ int main(int argc, char ** argv) { | ||||
|         std::vector<server_task> tasks; | ||||
|  | ||||
|         try { | ||||
|             std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, data.at("prompt"), true, true); | ||||
|             std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, data.at("prompt"), true, true); | ||||
|             tasks.reserve(tokenized_prompts.size()); | ||||
|             for (size_t i = 0; i < tokenized_prompts.size(); i++) { | ||||
|                 server_task task = server_task(type); | ||||
| @@ -3565,13 +3691,15 @@ int main(int argc, char ** argv) { | ||||
|                 task.index = i; | ||||
|  | ||||
|                 task.prompt_tokens    = std::move(tokenized_prompts[i]); | ||||
|                 task.params           = server_task::params_from_json_cmpl(ctx_server.model, ctx_server.ctx, ctx_server.params_base, data); | ||||
|                 task.params           = server_task::params_from_json_cmpl( | ||||
|                                             ctx_server.ctx, | ||||
|                                             ctx_server.params_base, | ||||
|                                             data); | ||||
|                 task.id_selected_slot = json_value(data, "id_slot", -1); | ||||
|  | ||||
|                 // OAI-compat | ||||
|                 task.params.oaicompat           = oaicompat; | ||||
|                 task.params.oaicompat_chat      = oaicompat_chat; | ||||
|                 task.params.oaicompat_cmpl_id   = completion_id; | ||||
|                 task.params.oaicompat         = oaicompat; | ||||
|                 task.params.oaicompat_cmpl_id = completion_id; | ||||
|                 // oaicompat_model is already populated by params_from_json_cmpl | ||||
|  | ||||
|                 tasks.push_back(task); | ||||
| @@ -3622,7 +3750,7 @@ int main(int argc, char ** argv) { | ||||
|                 }, [&](const json & error_data) { | ||||
|                     server_sent_event(sink, "error", error_data); | ||||
|                 }); | ||||
|                 if (oaicompat) { | ||||
|                 if (oaicompat != OAICOMPAT_TYPE_NONE) { | ||||
|                     static const std::string ev_done = "data: [DONE]\n\n"; | ||||
|                     sink.write(ev_done.data(), ev_done.size()); | ||||
|                 } | ||||
| @@ -3638,26 +3766,34 @@ int main(int argc, char ** argv) { | ||||
|         } | ||||
|     }; | ||||
|  | ||||
|     const auto handle_completions = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) { | ||||
|     const auto handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { | ||||
|         json data = json::parse(req.body); | ||||
|         return handle_completions_generic( | ||||
|         return handle_completions_impl( | ||||
|             SERVER_TASK_TYPE_COMPLETION, | ||||
|             data, | ||||
|             res, | ||||
|             /* oaicompat */ false, | ||||
|             /* oaicompat_chat */ false); | ||||
|             OAICOMPAT_TYPE_NONE); | ||||
|     }; | ||||
|  | ||||
|     const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { | ||||
|     const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { | ||||
|         json data = oaicompat_completion_params_parse(json::parse(req.body)); | ||||
|         return handle_completions_impl( | ||||
|             SERVER_TASK_TYPE_COMPLETION, | ||||
|             data, | ||||
|             res, | ||||
|             OAICOMPAT_TYPE_COMPLETION); | ||||
|     }; | ||||
|  | ||||
|     const auto handle_infill = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) { | ||||
|         // check model compatibility | ||||
|         std::string err; | ||||
|         if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) { | ||||
|         if (llama_vocab_fim_pre(ctx_server.vocab) == LLAMA_TOKEN_NULL) { | ||||
|             err += "prefix token is missing. "; | ||||
|         } | ||||
|         if (llama_token_fim_suf(ctx_server.model) == LLAMA_TOKEN_NULL) { | ||||
|         if (llama_vocab_fim_suf(ctx_server.vocab) == LLAMA_TOKEN_NULL) { | ||||
|             err += "suffix token is missing. "; | ||||
|         } | ||||
|         if (llama_token_fim_mid(ctx_server.model) == LLAMA_TOKEN_NULL) { | ||||
|         if (llama_vocab_fim_mid(ctx_server.vocab) == LLAMA_TOKEN_NULL) { | ||||
|             err += "middle token is missing. "; | ||||
|         } | ||||
|         if (!err.empty()) { | ||||
| @@ -3703,10 +3839,10 @@ int main(int argc, char ** argv) { | ||||
|         data["input_extra"] = input_extra; // default to an empty array if it does not exist | ||||
|  | ||||
|         std::string prompt = json_value(data, "prompt", std::string()); | ||||
|         std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true); | ||||
|         std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, false, true); | ||||
|         SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); | ||||
|         data["prompt"] = format_infill( | ||||
|             ctx_server.ctx, | ||||
|             ctx_server.vocab, | ||||
|             data.at("input_prefix"), | ||||
|             data.at("input_suffix"), | ||||
|             data.at("input_extra"), | ||||
| @@ -3717,10 +3853,14 @@ int main(int argc, char ** argv) { | ||||
|             tokenized_prompts[0] | ||||
|         ); | ||||
|  | ||||
|         return handle_completions_generic(SERVER_TASK_TYPE_INFILL, data, res); | ||||
|         return handle_completions_impl( | ||||
|             SERVER_TASK_TYPE_INFILL, | ||||
|             data, | ||||
|             res, | ||||
|             OAICOMPAT_TYPE_NONE); // infill is not OAI compatible | ||||
|     }; | ||||
|  | ||||
|     const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &handle_completions_generic, &get_chat_templates](const httplib::Request & req, httplib::Response & res) { | ||||
|     const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &handle_completions_impl, &get_chat_templates](const httplib::Request & req, httplib::Response & res) { | ||||
|         if (ctx_server.params_base.embedding) { | ||||
|             res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); | ||||
|             return; | ||||
| @@ -3731,12 +3871,11 @@ int main(int argc, char ** argv) { | ||||
|         const auto & chat_template = body.contains("tools") && templates.tool_use_template ? *templates.tool_use_template : templates.default_template; | ||||
|         json data = oaicompat_completion_params_parse(ctx_server.model, body, chat_template, params.use_jinja); | ||||
|  | ||||
|         return handle_completions_generic( | ||||
|         return handle_completions_impl( | ||||
|             SERVER_TASK_TYPE_COMPLETION, | ||||
|             data, | ||||
|             res, | ||||
|             /* oaicompat */ true, | ||||
|             /* oaicompat_chat */ true); | ||||
|             OAICOMPAT_TYPE_CHAT); | ||||
|     }; | ||||
|  | ||||
|     const auto handle_models = [¶ms, &ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { | ||||
| @@ -3764,7 +3903,7 @@ int main(int argc, char ** argv) { | ||||
|             const bool add_special = json_value(body, "add_special", false); | ||||
|             const bool with_pieces = json_value(body, "with_pieces", false); | ||||
|  | ||||
|             llama_tokens tokens = tokenize_mixed(ctx_server.ctx, body.at("content"), add_special, true); | ||||
|             llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, true); | ||||
|  | ||||
|             if (with_pieces) { | ||||
|                 for (const auto& token : tokens) { | ||||
| @@ -3809,10 +3948,10 @@ int main(int argc, char ** argv) { | ||||
|         res_ok(res, data); | ||||
|     }; | ||||
|  | ||||
|     const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res, bool oaicompat) { | ||||
|     const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res, oaicompat_type oaicompat) { | ||||
|         const json body = json::parse(req.body); | ||||
|  | ||||
|         if (oaicompat && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { | ||||
|         if (oaicompat != OAICOMPAT_TYPE_NONE && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { | ||||
|             res_error(res, format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST)); | ||||
|             return; | ||||
|         } | ||||
| @@ -3822,7 +3961,7 @@ int main(int argc, char ** argv) { | ||||
|         if (body.count("input") != 0) { | ||||
|             prompt = body.at("input"); | ||||
|         } else if (body.contains("content")) { | ||||
|             oaicompat = false; | ||||
|             oaicompat = OAICOMPAT_TYPE_NONE; // "content" field is not OAI compatible | ||||
|             prompt = body.at("content"); | ||||
|         } else { | ||||
|             res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST)); | ||||
| @@ -3840,7 +3979,7 @@ int main(int argc, char ** argv) { | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true); | ||||
|         std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true); | ||||
|         for (const auto & tokens : tokenized_prompts) { | ||||
|             // this check is necessary for models that do not add BOS token to the input | ||||
|             if (tokens.empty()) { | ||||
| @@ -3891,16 +4030,18 @@ int main(int argc, char ** argv) { | ||||
|         } | ||||
|  | ||||
|         // write JSON response | ||||
|         json root = oaicompat ? format_embeddings_response_oaicompat(body, responses, use_base64) : json(responses); | ||||
|         json root = oaicompat == OAICOMPAT_TYPE_EMBEDDING | ||||
|             ? format_embeddings_response_oaicompat(body, responses, use_base64) | ||||
|             : json(responses); | ||||
|         res_ok(res, root); | ||||
|     }; | ||||
|  | ||||
|     const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { | ||||
|         handle_embeddings_impl(req, res, false); | ||||
|         handle_embeddings_impl(req, res, OAICOMPAT_TYPE_NONE); | ||||
|     }; | ||||
|  | ||||
|     const auto handle_embeddings_oai = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { | ||||
|         handle_embeddings_impl(req, res, true); | ||||
|         handle_embeddings_impl(req, res, OAICOMPAT_TYPE_EMBEDDING); | ||||
|     }; | ||||
|  | ||||
|     const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { | ||||
| @@ -3938,20 +4079,20 @@ int main(int argc, char ** argv) { | ||||
|             return; | ||||
|         } | ||||
|  | ||||
|         llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.ctx, query, /* add_special */ false, true)[0]; | ||||
|         llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.vocab, query, /* add_special */ false, true)[0]; | ||||
|  | ||||
|         // create and queue the task | ||||
|         json responses = json::array(); | ||||
|         bool error = false; | ||||
|         { | ||||
|             std::vector<server_task> tasks; | ||||
|             std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.ctx, documents, /* add_special */ false, true); | ||||
|             std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true); | ||||
|             tasks.reserve(tokenized_docs.size()); | ||||
|             for (size_t i = 0; i < tokenized_docs.size(); i++) { | ||||
|                 server_task task   = server_task(SERVER_TASK_TYPE_RERANK); | ||||
|                 task.id            = ctx_server.queue_tasks.get_new_id(); | ||||
|                 task.index         = i; | ||||
|                 task.prompt_tokens = format_rerank(ctx_server.model, tokenized_query, tokenized_docs[i]); | ||||
|                 task.prompt_tokens = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]); | ||||
|                 tasks.push_back(task); | ||||
|             } | ||||
|  | ||||
| @@ -3983,8 +4124,9 @@ int main(int argc, char ** argv) { | ||||
|  | ||||
|     const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) { | ||||
|         json result = json::array(); | ||||
|         for (size_t i = 0; i < ctx_server.loras.size(); ++i) { | ||||
|             auto & lora = ctx_server.loras[i]; | ||||
|         const auto & loras = ctx_server.params_base.lora_adapters; | ||||
|         for (size_t i = 0; i < loras.size(); ++i) { | ||||
|             auto & lora = loras[i]; | ||||
|             result.push_back({ | ||||
|                 {"id", i}, | ||||
|                 {"path", lora.path}, | ||||
| @@ -3996,27 +4138,14 @@ int main(int argc, char ** argv) { | ||||
|     }; | ||||
|  | ||||
|     const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) { | ||||
|         const std::vector<json> body = json::parse(req.body); | ||||
|         int max_idx = ctx_server.loras.size(); | ||||
|  | ||||
|         // clear existing value | ||||
|         for (auto & lora : ctx_server.loras) { | ||||
|             lora.scale = 0.0f; | ||||
|         const json body = json::parse(req.body); | ||||
|         if (!body.is_array()) { | ||||
|             res_error(res, format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST)); | ||||
|             return; | ||||
|         } | ||||
|  | ||||
|         // set value | ||||
|         for (auto entry : body) { | ||||
|             int id      = entry.at("id"); | ||||
|             float scale = entry.at("scale"); | ||||
|             if (0 <= id && id < max_idx) { | ||||
|                 ctx_server.loras[id].scale = scale; | ||||
|             } else { | ||||
|                 throw std::runtime_error("invalid adapter id"); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         server_task task(SERVER_TASK_TYPE_SET_LORA); | ||||
|         task.id = ctx_server.queue_tasks.get_new_id(); | ||||
|         task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body); | ||||
|         ctx_server.queue_results.add_waiting_task_id(task.id); | ||||
|         ctx_server.queue_tasks.post(task); | ||||
|  | ||||
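The rewritten `/lora-adapters` POST handler no longer mutates adapter scales in place. It parses the request into a fresh adapter list and ships it to the server thread as a `SERVER_TASK_TYPE_SET_LORA` task, which simply replaces `params_base.lora_adapters` (see the task handler earlier in the diff); the adapters are then applied lazily, once per batch, via `common_set_adapter_lora`, so the HTTP handler never touches the llama context. `parse_lora_request` itself is defined in the server utils, not in this file; a sketch of the behaviour it presumably has, reusing the semantics of the removed handler code above (unknown ids rejected, unlisted adapters disabled) and assuming the same common headers as server.cpp, with `nlohmann::json` standing in for the server's own json alias:

    #include <stdexcept>
    #include <vector>
    #include <nlohmann/json.hpp>
    #include "common.h" // for common_adapter_lora_info

    // Sketch (assumption): build a new adapter list from the configured base list,
    // zeroing every scale and then applying the {"id", "scale"} entries from the request.
    static std::vector<common_adapter_lora_info> parse_lora_request(
            const std::vector<common_adapter_lora_info> & base,
            const nlohmann::json & data) {
        std::vector<common_adapter_lora_info> result = base;
        // disable everything first, mirroring the old handler's "clear existing value" step
        for (auto & entry : result) {
            entry.scale = 0.0f;
        }
        for (const auto & entry : data) {
            const int   id    = entry.at("id");
            const float scale = entry.at("scale");
            if (id < 0 || id >= (int) result.size()) {
                throw std::runtime_error("invalid adapter id");
            }
            result[id].scale = scale;
        }
        return result;
    }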
| @@ -4070,7 +4199,7 @@ int main(int argc, char ** argv) { | ||||
|     svr->Get ("/v1/models",           handle_models); // public endpoint (no API key check) | ||||
|     svr->Post("/completion",          handle_completions); // legacy | ||||
|     svr->Post("/completions",         handle_completions); | ||||
|     svr->Post("/v1/completions",      handle_completions); | ||||
|     svr->Post("/v1/completions",      handle_completions_oai); | ||||
|     svr->Post("/chat/completions",    handle_chat_completions); | ||||
|     svr->Post("/v1/chat/completions", handle_chat_completions); | ||||
|     svr->Post("/infill",              handle_infill); | ||||
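`/v1/completions` is now routed to `handle_completions_oai`, which runs the body through `oaicompat_completion_params_parse` and tags the task with `OAICOMPAT_TYPE_COMPLETION`, so clients receive the OpenAI `text_completion` object built by the new `to_json_oaicompat()` above. A minimal client-side sketch using the same cpp-httplib and nlohmann::json the server already depends on; host, port, and the prompt are placeholders, and `max_tokens` is assumed to be mapped onto `n_predict` by the server:

    #include <iostream>
    #include <nlohmann/json.hpp>
    #include "httplib.h"

    int main() {
        httplib::Client cli("http://localhost:8080"); // assumes a locally running llama-server
        nlohmann::json body = {
            {"prompt",     "The capital of France is"},
            {"max_tokens", 16}, // standard OpenAI completion field
        };
        auto res = cli.Post("/v1/completions", body.dump(), "application/json");
        if (!res || res->status != 200) {
            std::cerr << "request failed\n";
            return 1;
        }
        const auto reply = nlohmann::json::parse(res->body);
        // field names match the object assembled in to_json_oaicompat() above
        std::cout << reply.at("choices").at(0).at("text").get<std::string>() << "\n";
        std::cout << "completion tokens: " << reply.at("usage").at("completion_tokens") << "\n";
        return 0;
    }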
| @@ -4150,14 +4279,16 @@ int main(int argc, char ** argv) { | ||||
|  | ||||
|     // if a custom chat template is not supplied, we will use the one that comes with the model (if any) | ||||
|     if (params.chat_template.empty()) { | ||||
|         if (!ctx_server.validate_model_chat_template(params.use_jinja)) { | ||||
|         if (!ctx_server.validate_builtin_chat_template(params.use_jinja)) { | ||||
|             LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__); | ||||
|             params.chat_template = "chatml"; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // print sample chat example to make it clear which template is used | ||||
|     LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), common_chat_format_example(ctx_server.model, params.chat_template).c_str()); | ||||
|     LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__, | ||||
|         params.chat_template.empty() ? "(built-in)" : params.chat_template.c_str(), | ||||
|         common_chat_format_example(ctx_server.model, params.chat_template).c_str()); | ||||
|  | ||||
|     ctx_server.queue_tasks.on_new_task(std::bind( | ||||
|                 &server_context::process_single_task, &ctx_server, std::placeholders::_1)); | ||||
|   | ||||