llama : add --no-host to disable host buffers (#16310)

* implement --no-host to disable host buffer * fix equal_mparams * move no-host enumeration order together with other model params --------- Co-authored-by: slaren <slarengh@gmail.com>
2025-10-27 08:21:30 +00:00 · 2025-10-06 12:55:53 -05:00
parent c08002a198
commit 3df2244df4
6 changed files with 56 additions and 10 deletions
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2584,6 +2584,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.no_extra_bufts = true;
        }
    ).set_env("LLAMA_ARG_NO_REPACK"));
+    add_opt(common_arg(
+        {"--no-host"},
+        "bypass host buffer allowing extra buffers to be used",
+        [](common_params & params) {
+            params.no_host = true;
+        }
+    ).set_env("LLAMA_ARG_NO_HOST"));
    add_opt(common_arg(
        {"-ctk", "--cache-type-k"}, "TYPE",
        string_format(
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1133,6 +1133,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
    mparams.use_mlock       = params.use_mlock;
    mparams.check_tensors   = params.check_tensors;
    mparams.use_extra_bufts = !params.no_extra_bufts;
+    mparams.no_host         = params.no_host;

    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;
--- a/common/common.h
+++ b/common/common.h
@@ -392,6 +392,7 @@ struct common_params {
    bool check_tensors     = false; // validate tensor data
    bool no_op_offload     = false; // globally disable offload host tensor operations to device
    bool no_extra_bufts    = false; // disable extra buffer types (used for weight repacking)
+    bool no_host           = false; // bypass host buffer allowing extra buffers to be used

    bool single_turn       = false; // single turn chat conversation