server : dynamic token limit for prompt cache (#16560)

* server : dynamic token limit for prompt cache

* cont : print estimated token limit
This commit is contained in:
Georgi Gerganov
2025-10-14 08:48:50 +03:00
committed by GitHub
parent e60f241eac
commit bc07349a7f

View File

@@ -1585,23 +1585,31 @@ struct server_prompt_cache {
} }
} }
// average size per token
const float size_per_token = std::max<float>(1.0f, float(size()) / (std::max<size_t>(1, n_tokens())));
// dynamically increase the token limit if it can fit in the memory limit
const size_t limit_tokens_cur = limit_size > 0 ? std::max<size_t>(limit_tokens, limit_size/size_per_token) : limit_tokens;
if (limit_tokens > 0) { if (limit_tokens > 0) {
while (states.size() > 1 && n_tokens() > limit_tokens) { while (states.size() > 1 && n_tokens() > limit_tokens_cur) {
if (states.empty()) { if (states.empty()) {
break; break;
} }
SRV_WRN(" - cache token limit reached, removing oldest entry (size = %.3f MiB)\n", states.front().size() / (1024.0 * 1024.0)); SRV_WRN(" - cache token limit (%zu, est: %zu) reached, removing oldest entry (size = %.3f MiB)\n",
limit_tokens, limit_tokens_cur, states.front().size() / (1024.0 * 1024.0));
states.pop_front(); states.pop_front();
} }
} }
SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens)\n", SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n",
states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens); states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens, limit_tokens_cur);
for (const auto & state : states) { for (const auto & state : states) {
SRV_WRN(" - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n", (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0)); SRV_WRN(" - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n",
(const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));
} }
} }
}; };