Mirror of https://github.com/ggml-org/llama.cpp.git, synced 2025-10-28 08:31:25 +00:00
server : enable /slots by default and make it secure (#15630)
* server : enable /slots by default and make it secure

  ggml-ci

* server : fix tests to pass `--no-slots` when necessary

* server : extend /props with info about enabled endpoints

@@ -141,7 +141,7 @@ struct slot_params {
     // Embeddings
     int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)
 
-    json to_json() const {
+    json to_json(bool only_metrics = false) const {
         std::vector<std::string> samplers;
         samplers.reserve(sampling.samplers.size());
         for (const auto & sampler : sampling.samplers) {
@@ -153,9 +153,55 @@ struct slot_params {
             lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
         }
 
+        if (only_metrics) {
+            return json {
+                {"n_predict", n_predict}, // Server configured n_predict
+                {"seed", sampling.seed},
+                {"temperature", sampling.temp},
+                {"dynatemp_range", sampling.dynatemp_range},
+                {"dynatemp_exponent", sampling.dynatemp_exponent},
+                {"top_k", sampling.top_k},
+                {"top_p", sampling.top_p},
+                {"min_p", sampling.min_p},
+                {"top_n_sigma", sampling.top_n_sigma},
+                {"xtc_probability", sampling.xtc_probability},
+                {"xtc_threshold", sampling.xtc_threshold},
+                {"typical_p", sampling.typ_p},
+                {"repeat_last_n", sampling.penalty_last_n},
+                {"repeat_penalty", sampling.penalty_repeat},
+                {"presence_penalty", sampling.penalty_present},
+                {"frequency_penalty", sampling.penalty_freq},
+                {"dry_multiplier", sampling.dry_multiplier},
+                {"dry_base", sampling.dry_base},
+                {"dry_allowed_length", sampling.dry_allowed_length},
+                {"dry_penalty_last_n", sampling.dry_penalty_last_n},
+                {"mirostat", sampling.mirostat},
+                {"mirostat_tau", sampling.mirostat_tau},
+                {"mirostat_eta", sampling.mirostat_eta},
+                {"max_tokens", n_predict}, // User configured n_predict
+                {"n_keep", n_keep},
+                {"n_discard", n_discard},
+                {"ignore_eos", sampling.ignore_eos},
+                {"stream", stream},
+                {"n_probs", sampling.n_probs},
+                {"min_keep", sampling.min_keep},
+                {"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)},
+                {"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)},
+                {"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content},
+                {"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open},
+                {"samplers", samplers},
+                {"speculative.n_max", speculative.n_max},
+                {"speculative.n_min", speculative.n_min},
+                {"speculative.p_min", speculative.p_min},
+                {"timings_per_token", timings_per_token},
+                {"post_sampling_probs", post_sampling_probs},
+                {"lora", lora},
+            };
+        }
+
         auto grammar_triggers = json::array();
         for (const auto & trigger : sampling.grammar_triggers) {
-            server_grammar_trigger ct(std::move(trigger));
+            server_grammar_trigger ct(trigger);
             grammar_triggers.push_back(ct.to_json());
         }
 
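Two fixes land in this hunk besides the metrics-only early return. First, the reduced payload contains only sampling parameters, counters, and flags: nothing user-supplied (no prompt text, no grammar) is serialized, which is the basis for exposing it publicly. Second, the `std::move(trigger)` is dropped: `trigger` is bound as `const auto &`, and `std::move` on a const lvalue yields a const rvalue that cannot bind to a move constructor, so overload resolution silently falls back to the copy constructor. Inside a `const` member function the cast could never have moved anything; removing it states the actual behavior. A minimal standalone illustration of that pitfall, using a hypothetical `payload` type rather than anything from the repository:

    #include <cassert>
    #include <string>
    #include <utility>
    #include <vector>

    struct payload {
        std::string data;
    };

    int main() {
        std::vector<payload> items = {{"alpha"}, {"beta"}};
        for (const auto & item : items) {
            // std::move on a const lvalue produces `const payload&&`, which cannot
            // bind to payload(payload&&); the copy constructor is selected instead,
            // so this "move" is really a copy.
            payload p(std::move(item));
            (void)p;
        }
        // Nothing was moved from: the source elements are intact.
        assert(items[0].data == "alpha" && items[1].data == "beta");
        return 0;
    }
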
@@ -1572,7 +1618,26 @@ struct server_slot {
         }
     }
 
-    json to_json() const {
+    json to_json(bool only_metrics = false) const {
+        if (only_metrics) {
+            return json {
+                {"id", id},
+                {"id_task", id_task},
+                {"n_ctx", n_ctx},
+                {"speculative", can_speculate()},
+                {"is_processing", is_processing()},
+                {"params", params.to_json(true)},
+                {"next_token",
+                    {
+                        {"has_next_token", has_next_token},
+                        {"has_new_line", has_new_line},
+                        {"n_remain", n_remaining},
+                        {"n_decoded", n_decoded},
+                    }
+                },
+            };
+        }
+
         return json {
             {"id", id},
             {"id_task", id_task},
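The `server_slot` serializer gets the same flag-dispatch treatment as `slot_params` above: a single `to_json` with a defaulted `only_metrics` parameter, so every existing `to_json()` call site keeps its behavior while monitoring code opts into the reduced view. A condensed sketch of the pattern, with a made-up `slot_view` type standing in for the real struct:

    #include <nlohmann/json.hpp>
    #include <string>

    using json = nlohmann::ordered_json;

    struct slot_view {
        int         id = 0;
        std::string prompt;  // user-supplied; must never leak into monitoring output

        json to_json(bool only_metrics = false) const {
            if (only_metrics) {
                // Reduced view: identifiers and counters only.
                return json { {"id", id} };
            }
            // Full view: includes request content such as the prompt.
            return json { {"id", id}, {"prompt", prompt} };
        }
    };

    int main() {
        slot_view s;
        s.id     = 42;
        s.prompt = "secret user text";
        const bool full_has = s.to_json().contains("prompt");      // true
        const bool safe_has = s.to_json(true).contains("prompt");  // false
        return (full_has && !safe_has) ? 0 : 1;
    }
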
@@ -2874,7 +2939,7 @@ struct server_context {
             int n_processing_slots = 0;
 
             for (server_slot & slot : slots) {
-                json slot_data = slot.to_json();
+                json slot_data = slot.to_json(true);
 
                 if (slot.is_processing()) {
                     n_processing_slots++;
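This is the heart of "make it secure": the slot data handed back to monitoring endpoints such as `/slots` is now built with `to_json(true)`, so prompts and generated text never appear in the response, which is what makes enabling the endpoint by default acceptable. Going by the fields added in the `server_slot` hunk, one entry of the response could plausibly look like this (values illustrative, `params` abridged):

    {
        "id": 0,
        "id_task": 7,
        "n_ctx": 4096,
        "speculative": false,
        "is_processing": true,
        "params": { "temperature": 0.8, "top_k": 40 },
        "next_token": {
            "has_next_token": true,
            "has_new_line": false,
            "n_remain": 120,
            "n_decoded": 8
        }
    }
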
@@ -4271,16 +4336,20 @@ int main(int argc, char ** argv) {
         }
     };
 
-    const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
+    const auto handle_props = [&params, &ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
         // this endpoint is publicly available, please only return what is safe to be exposed
         json data = {
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
             { "total_slots", ctx_server.params_base.n_parallel },
             { "model_path", ctx_server.params_base.model.path },
-            { "modalities", json{
+            { "modalities", json {
                 {"vision", ctx_server.oai_parser_opt.allow_image},
                 {"audio", ctx_server.oai_parser_opt.allow_audio},
             } },
+            { "endpoint_slots", params.endpoint_slots },
+            { "endpoint_props", params.endpoint_props },
+            { "endpoint_metrics", params.endpoint_metrics },
+            { "webui", params.webui },
             { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
             { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
             { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
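Since `/props` stays publicly reachable, it only ever reports configuration, and it now additionally tells clients which optional endpoints are switched on, so a consumer can check `endpoint_slots` before calling `/slots` instead of discovering failures at request time. An abridged example of the extended response (values illustrative, other fields omitted):

    {
        "total_slots": 4,
        "model_path": "/models/example.gguf",
        "modalities": { "vision": false, "audio": false },
        "endpoint_slots": true,
        "endpoint_props": true,
        "endpoint_metrics": false,
        "webui": true
    }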