mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-10-27 08:21:30 +00:00
server : dynamic token limit for prompt cache (#16560)
* server : dynamic token limit for prompt cache * cont : print estimated token limit
This commit is contained in:
@@ -1585,23 +1585,31 @@ struct server_prompt_cache {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// average size per token
|
||||||
|
const float size_per_token = std::max<float>(1.0f, float(size()) / (std::max<size_t>(1, n_tokens())));
|
||||||
|
|
||||||
|
// dynamically increase the token limit if it can fit in the memory limit
|
||||||
|
const size_t limit_tokens_cur = limit_size > 0 ? std::max<size_t>(limit_tokens, limit_size/size_per_token) : limit_tokens;
|
||||||
|
|
||||||
if (limit_tokens > 0) {
|
if (limit_tokens > 0) {
|
||||||
while (states.size() > 1 && n_tokens() > limit_tokens) {
|
while (states.size() > 1 && n_tokens() > limit_tokens_cur) {
|
||||||
if (states.empty()) {
|
if (states.empty()) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
SRV_WRN(" - cache token limit reached, removing oldest entry (size = %.3f MiB)\n", states.front().size() / (1024.0 * 1024.0));
|
SRV_WRN(" - cache token limit (%zu, est: %zu) reached, removing oldest entry (size = %.3f MiB)\n",
|
||||||
|
limit_tokens, limit_tokens_cur, states.front().size() / (1024.0 * 1024.0));
|
||||||
|
|
||||||
states.pop_front();
|
states.pop_front();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens)\n",
|
SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n",
|
||||||
states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens);
|
states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens, limit_tokens_cur);
|
||||||
|
|
||||||
for (const auto & state : states) {
|
for (const auto & state : states) {
|
||||||
SRV_WRN(" - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n", (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));
|
SRV_WRN(" - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n",
|
||||||
|
(const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|||||||
Reference in New Issue
Block a user