server : enable /slots by default and make it secure (#15630)

* server : enable /slots by default and make it secure

ggml-ci

* server : fix tests to pass `--no-slots` when necessary

* server : extend /props with info about enabled endpoints
Author: Georgi Gerganov
Date:   2025-08-31 20:11:58 +03:00
Committed by: GitHub
Parent: 4efd5a8316
Commit: 0d161f021a
5 changed files with 200 additions and 47 deletions
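
Note: the new `/props` fields let a client discover which endpoints are enabled before calling them. A minimal probe might look like the following sketch (the port is the llama-server default and the flag values are illustrative; the field names come from the patch below):

    # Sketch: query /props and inspect the endpoint flags added by this commit.
    # Assumes a llama-server instance on the default port; values depend on server flags.
    curl -s http://localhost:8080/props | jq '{endpoint_slots, endpoint_props, endpoint_metrics, webui}'
    # e.g. -> { "endpoint_slots": true, "endpoint_props": true, "endpoint_metrics": false, "webui": true }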


@@ -141,7 +141,7 @@ struct slot_params {
     // Embeddings
     int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)

-    json to_json() const {
+    json to_json(bool only_metrics = false) const {
         std::vector<std::string> samplers;
         samplers.reserve(sampling.samplers.size());
         for (const auto & sampler : sampling.samplers) {
@@ -153,9 +153,55 @@ struct slot_params {
             lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
         }

+        if (only_metrics) {
+            return json {
+                {"n_predict", n_predict}, // Server configured n_predict
+                {"seed", sampling.seed},
+                {"temperature", sampling.temp},
+                {"dynatemp_range", sampling.dynatemp_range},
+                {"dynatemp_exponent", sampling.dynatemp_exponent},
+                {"top_k", sampling.top_k},
+                {"top_p", sampling.top_p},
+                {"min_p", sampling.min_p},
+                {"top_n_sigma", sampling.top_n_sigma},
+                {"xtc_probability", sampling.xtc_probability},
+                {"xtc_threshold", sampling.xtc_threshold},
+                {"typical_p", sampling.typ_p},
+                {"repeat_last_n", sampling.penalty_last_n},
+                {"repeat_penalty", sampling.penalty_repeat},
+                {"presence_penalty", sampling.penalty_present},
+                {"frequency_penalty", sampling.penalty_freq},
+                {"dry_multiplier", sampling.dry_multiplier},
+                {"dry_base", sampling.dry_base},
+                {"dry_allowed_length", sampling.dry_allowed_length},
+                {"dry_penalty_last_n", sampling.dry_penalty_last_n},
+                {"mirostat", sampling.mirostat},
+                {"mirostat_tau", sampling.mirostat_tau},
+                {"mirostat_eta", sampling.mirostat_eta},
+                {"max_tokens", n_predict}, // User configured n_predict
+                {"n_keep", n_keep},
+                {"n_discard", n_discard},
+                {"ignore_eos", sampling.ignore_eos},
+                {"stream", stream},
+                {"n_probs", sampling.n_probs},
+                {"min_keep", sampling.min_keep},
+                {"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)},
+                {"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)},
+                {"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content},
+                {"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open},
+                {"samplers", samplers},
+                {"speculative.n_max", speculative.n_max},
+                {"speculative.n_min", speculative.n_min},
+                {"speculative.p_min", speculative.p_min},
+                {"timings_per_token", timings_per_token},
+                {"post_sampling_probs", post_sampling_probs},
+                {"lora", lora},
+            };
+        }
+
         auto grammar_triggers = json::array();
         for (const auto & trigger : sampling.grammar_triggers) {
-            server_grammar_trigger ct(std::move(trigger));
+            server_grammar_trigger ct(trigger);
             grammar_triggers.push_back(ct.to_json());
         }
@@ -1572,7 +1618,26 @@ struct server_slot {
         }
     }

-    json to_json() const {
+    json to_json(bool only_metrics = false) const {
+        if (only_metrics) {
+            return json {
+                {"id", id},
+                {"id_task", id_task},
+                {"n_ctx", n_ctx},
+                {"speculative", can_speculate()},
+                {"is_processing", is_processing()},
+                {"params", params.to_json(true)},
+                {"next_token",
+                    {
+                        {"has_next_token", has_next_token},
+                        {"has_new_line", has_new_line},
+                        {"n_remain", n_remaining},
+                        {"n_decoded", n_decoded},
+                    }
+                },
+            };
+        }
+
         return json {
             {"id", id},
             {"id_task", id_task},
@@ -2874,7 +2939,7 @@ struct server_context {
        int n_processing_slots = 0;

        for (server_slot & slot : slots) {
-           json slot_data = slot.to_json();
+           json slot_data = slot.to_json(true);

            if (slot.is_processing()) {
                n_processing_slots++;
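
With `to_json(true)`, the `/slots` response is limited to the metric-style fields listed in the hunks above, so sensitive request content (prompts, generated text) is no longer serialized. An illustrative check (hand-written; field names are from the patch, values are made up):

    # Sketch of the sanitized per-slot output on the default port.
    curl -s http://localhost:8080/slots
    # -> [ { "id": 0, "id_task": 42, "n_ctx": 4096, "speculative": false, "is_processing": true,
    #        "params": { "n_predict": -1, "seed": 42, ... },
    #        "next_token": { "has_next_token": true, "has_new_line": false, "n_remain": -1, "n_decoded": 128 } } ]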
@@ -4271,16 +4336,20 @@ int main(int argc, char ** argv) {
        }
    };

-    const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
+    const auto handle_props = [&params, &ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
        // this endpoint is publicly available, please only return what is safe to be exposed
        json data = {
            { "default_generation_settings", ctx_server.default_generation_settings_for_props },
            { "total_slots", ctx_server.params_base.n_parallel },
            { "model_path", ctx_server.params_base.model.path },
-            { "modalities", json{
+            { "modalities", json {
                {"vision", ctx_server.oai_parser_opt.allow_image},
                {"audio", ctx_server.oai_parser_opt.allow_audio},
            } },
+            { "endpoint_slots", params.endpoint_slots },
+            { "endpoint_props", params.endpoint_props },
+            { "endpoint_metrics", params.endpoint_metrics },
+            { "webui", params.webui },
            { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
            { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
            { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},