mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-11-06 09:46:50 +00:00
server : support unified cache across slots (#16736)
* server : support unified context across slots * cont : fix speculative decoding initialization * context : fix n_ctx_per_seq computation * server : purge slots one by one * tests : add unified cache server tests * llama : update per-seq context computation * test-thread-safety : handle tiny training context of the input model * server : fix server_tokens clear() * server : use 4 slots + unified KV by default * llama : add note about context size queries * cont : update todos [no ci] * context : do not cap the size of the context * tests : adjust parameters to be CI friendlier * context : add warning
This commit is contained in:
@@ -461,7 +461,10 @@ extern "C" {
|
||||
LLAMA_API bool llama_supports_gpu_offload(void);
|
||||
LLAMA_API bool llama_supports_rpc (void);
|
||||
|
||||
// NOTE: After creating a llama_context, it is recommended to query the actual values using these functions
|
||||
// In some cases the requested values via llama_context_params may differ from the actual values used by the context
|
||||
LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
|
||||
LLAMA_API uint32_t llama_n_ctx_seq (const struct llama_context * ctx);
|
||||
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
|
||||
LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
|
||||
LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
|
||||
@@ -585,7 +588,7 @@ extern "C" {
|
||||
LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
|
||||
|
||||
// Manually free a LoRA adapter
|
||||
// Note: loaded adapters will be free when the associated model is deleted
|
||||
// NOTE: loaded adapters will be free when the associated model is deleted
|
||||
LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
|
||||
|
||||
// Get the invocation tokens if the current lora is an alora
|
||||
|
||||
Reference in New Issue
Block a user