Merge branch 'master' into xsn/private_batch_api
@@ -1868,6 +1868,10 @@ struct server_context {
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
             params_dft.n_parallel   = 1;
 
+            // force F16 KV cache for the draft model for extra performance
+            params_dft.cache_type_k = GGML_TYPE_F16;
+            params_dft.cache_type_v = GGML_TYPE_F16;
+
             llama_init_dft = common_init_from_params(params_dft);
 
             model_dft = llama_init_dft.model.get();
@@ -1888,10 +1892,6 @@ struct server_context {
             cparams_dft = common_context_params_to_llama(params_dft);
             cparams_dft.n_batch = n_ctx_dft;
 
-            // force F16 KV cache for the draft model for extra performance
-            cparams_dft.type_k = GGML_TYPE_F16;
-            cparams_dft.type_v = GGML_TYPE_F16;
-
             // the context is not needed - we will create one for each slot
             llama_init_dft.context.reset();
         }
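Taken together, the two hunks move the F16 KV-cache override for the draft model from the derived llama_context_params (type_k/type_v) up to the common_params consumed by common_init_from_params (cache_type_k/cache_type_v). The override is therefore already in effect when the draft model is initialized, and any context params later derived via common_context_params_to_llama inherit it automatically. A minimal, self-contained sketch of that ordering follows; the struct and field names are borrowed from the diff above, but the types here are mocked-up stand-ins, not the real llama.cpp headers.

#include <cstdio>

// mocked-up stand-ins for the llama.cpp types named in the diff
enum ggml_type { GGML_TYPE_F32, GGML_TYPE_F16 };

struct common_params {
    ggml_type cache_type_k = GGML_TYPE_F32; // default KV cache types
    ggml_type cache_type_v = GGML_TYPE_F32;
};

struct llama_context_params {
    ggml_type type_k;
    ggml_type type_v;
};

// stand-in for common_context_params_to_llama(): context params are derived
// from common_params, so an override applied to common_params first is
// inherited by every context created afterwards
static llama_context_params to_llama_cparams(const common_params & p) {
    return { p.cache_type_k, p.cache_type_v };
}

int main() {
    common_params params_dft;

    // apply the override before any params-derived initialization,
    // mirroring the new placement in the first hunk
    params_dft.cache_type_k = GGML_TYPE_F16;
    params_dft.cache_type_v = GGML_TYPE_F16;

    // the second hunk can then drop its post-hoc patch: the derived
    // context params already carry F16
    llama_context_params cparams_dft = to_llama_cparams(params_dft);

    std::printf("draft KV cache is F16: %s\n",
                cparams_dft.type_k == GGML_TYPE_F16 &&
                cparams_dft.type_v == GGML_TYPE_F16 ? "yes" : "no");
    return 0;
}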
Xuan Son Nguyen