server : support unified cache across slots (#16736)

* server : support unified context across slots

* cont : fix speculative decoding initialization

* context : fix n_ctx_per_seq computation

* server : purge slots one by one

* tests : add unified cache server tests

* llama : update per-seq context computation

* test-thread-safety : handle tiny training context of the input model

* server : fix server_tokens clear()

* server : use 4 slots + unified KV by default

* llama : add note about context size queries

* cont : update todos [no ci]

* context : do not cap the size of the context

* tests : adjust parameters to be CI friendlier

* context : add warning
This commit is contained in:
Georgi Gerganov
2025-11-02 18:14:04 +02:00
committed by GitHub
parent 87c9efc3b2
commit cd5e3b5754
12 changed files with 163 additions and 48 deletions

View File

@@ -1212,7 +1212,7 @@ public:
for (auto it = tokens.map_idx_to_media.begin(); it != tokens.map_idx_to_media.end(); ) {
auto * chunk = tokens.map_idx_to_media[it->first].get();
mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
map_idx_to_media[start_idx+it->first] = std::move(new_chunk);
map_idx_to_media[start_idx + it->first] = std::move(new_chunk);
}
}
}
@@ -1244,6 +1244,7 @@ public:
}
void clear() {
map_idx_to_media.clear();
tokens.clear();
}