server : add server parameters for draft model cache type (#13782)

Co-authored-by: aa956 <27946957+aa956@users.noreply.github.com>
2025-11-02 09:12:03 +00:00 · 2025-06-19 16:01:03 +03:00
parent 456af35eb7
commit d67341dc18
4 changed files with 33 additions and 4 deletions
--- a/common/common.h
+++ b/common/common.h
@@ -199,6 +199,9 @@ struct common_params_speculative {
    float   p_split      =  0.1f; // speculative decoding split probability
    float   p_min        = 0.75f; // minimum speculative decoding probability (greedy)

+    ggml_type cache_type_k = GGML_TYPE_F16; // data type of the K entries in the draft model's KV cache (defaults to F16)
+    ggml_type cache_type_v = GGML_TYPE_F16; // data type of the V entries in the draft model's KV cache (defaults to F16)
+
    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;