llama : add --no-host to disable host buffers (#16310)

* implement --no-host to disable host buffers

* fix equal_mparams

* move no-host in the enumeration order so that it sits together with the other model params

---------

Co-authored-by: slaren <slarengh@gmail.com>
This commit is contained in:
Gadflyii
2025-10-06 12:55:53 -05:00
committed by GitHub
parent c08002a198
commit 3df2244df4
6 changed files with 56 additions and 10 deletions

View File

@@ -2584,6 +2584,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.no_extra_bufts = true;
}
).set_env("LLAMA_ARG_NO_REPACK"));
// Registers the `--no-host` command-line option. It is declared without a
// value hint, and its handler takes only the params object and sets
// params.no_host to true.
// NOTE(review): set_env presumably allows the same option to be enabled via
// the LLAMA_ARG_NO_HOST environment variable — confirm against common_arg.
add_opt(common_arg(
{"--no-host"},
"bypass host buffer allowing extra buffers to be used",
[](common_params & params) {
params.no_host = true;
}
).set_env("LLAMA_ARG_NO_HOST"));
add_opt(common_arg(
{"-ctk", "--cache-type-k"}, "TYPE",
string_format(

View File

@@ -1133,6 +1133,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
mparams.use_extra_bufts = !params.no_extra_bufts;
mparams.no_host = params.no_host;
if (params.kv_overrides.empty()) {
mparams.kv_overrides = NULL;

View File

@@ -392,6 +392,7 @@ struct common_params {
bool check_tensors = false; // validate tensor data
bool no_op_offload = false; // globally disable offload host tensor operations to device
bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
bool no_host = false; // bypass host buffer allowing extra buffers to be used
bool single_turn = false; // single turn chat conversation