mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-27 08:21:30 +00:00)
llama-bench: add --devices and --list-devices support (#16039)
* llama-bench: add --devices support
  - support --devices the same as llama-server
  - provide for benchmarking different device combinations
  - include --list-devices like llama-server for convenience
* fix: field display ordering restored
* fix: integrated the RPC devices - aimed to mimic the server as much as possible
* cleanup: defaults for list-devices - handle duplicate device listing with RPC
* cleanup: remove duplicate device load calls
* docs: update llama-bench - added the recently added n-cpu-moe option to the docs while in there
* llama-bench: RPC device simplification
  - RPC servers unify with other devices earlier, simplifying code
  - --list-devices made stateless and simpler
  - various cleanup
@@ -30,8 +30,10 @@ options:
   --delay <0...N>                           (seconds) delay between each test (default: 0)
   -o, --output <csv|json|jsonl|md|sql>      output format printed to stdout (default: md)
   -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: none)
+  --list-devices                            list available devices and exit
   -v, --verbose                             verbose output
   --progress                                print test progress indicators
+  -rpc, --rpc <rpc_servers>                 register RPC devices (comma separated)
 
 test parameters:
   -m, --model <filename>                    (default: models/7B/ggml-model-q4_0.gguf)
@@ -48,11 +50,12 @@ test parameters:
   --cpu-strict <0|1>                        (default: 0)
   --poll <0...100>                          (default: 50)
   -ngl, --n-gpu-layers <n>                  (default: 99)
-  -rpc, --rpc <rpc_servers>                 (default: none)
+  -ncmoe, --n-cpu-moe <n>                   (default: 0)
   -sm, --split-mode <none|layer|row>        (default: layer)
   -mg, --main-gpu <i>                       (default: 0)
   -nkvo, --no-kv-offload <0|1>              (default: 0)
   -fa, --flash-attn <0|1>                   (default: 0)
+  -dev, --device <dev0/dev1/...>            (default: auto)
   -mmp, --mmap <0|1>                        (default: 1)
   -embd, --embeddings <0|1>                 (default: 0)
   -ts, --tensor-split <ts0/ts1/..>          (default: 0)
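Taken together, the new flags let one invocation enumerate devices and then benchmark several device combinations. A hypothetical example (device names depend on the local backends; check --list-devices first):

  # list the non-CPU devices ggml can see
  llama-bench --list-devices

  # benchmark one GPU, both GPUs, and CPU-only ("none") in a single run
  llama-bench -m model.gguf -dev CUDA0,CUDA0/CUDA1,none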
@@ -17,6 +17,7 @@
 #include <string>
 #include <thread>
 #include <vector>
+#include <unordered_set>
 
 #include "common.h"
 #include "ggml.h"
@@ -135,6 +136,101 @@ static std::string get_gpu_info() {
     return join(gpu_list, ", ");
 }
 
+static std::vector<ggml_backend_dev_t> parse_devices_arg(const std::string & value) {
+    std::vector<ggml_backend_dev_t> devices;
+    std::string trimmed = string_strip(value);
+    if (trimmed.empty()) {
+        throw std::invalid_argument("no devices specified");
+    }
+    if (trimmed == "auto") {
+        return devices;
+    }
+
+    auto dev_names = string_split<std::string>(trimmed, '/');
+    if (dev_names.size() == 1 && string_strip(dev_names[0]) == "none") {
+        devices.push_back(nullptr);
+        return devices;
+    }
+
+    for (auto & name : dev_names) {
+        std::string dev_name = string_strip(name);
+        if (dev_name.empty()) {
+            throw std::invalid_argument("invalid device specification");
+        }
+        auto * dev = ggml_backend_dev_by_name(dev_name.c_str());
+        if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
+            throw std::invalid_argument(string_format("invalid device: %s", dev_name.c_str()));
+        }
+        devices.push_back(dev);
+    }
+
+    devices.push_back(nullptr);
+    return devices;
+}
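Note the sentinel convention: the vector parse_devices_arg returns is consumed directly as the nullptr-terminated list that llama_model_params.devices expects. The mapping, in brief (device names hypothetical; what exists depends on the local backends):

    // parse_devices_arg("auto")        -> {}                        (empty: let llama.cpp pick)
    // parse_devices_arg("none")        -> { nullptr }               (sentinel only: CPU-only run)
    // parse_devices_arg("CUDA0/CUDA1") -> { dev0, dev1, nullptr }   (explicit list, nullptr-terminated)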
+static std::vector<ggml_backend_dev_t> register_rpc_device_list(const std::string & servers) {
+    auto rpc_servers = string_split<std::string>(servers, ',');
+    if (rpc_servers.empty()) {
+        throw std::invalid_argument("no RPC servers specified");
+    }
+
+    auto * rpc_reg = ggml_backend_reg_by_name("RPC");
+    if (!rpc_reg) {
+        throw std::invalid_argument("failed to find RPC backend");
+    }
+
+    using add_rpc_device_fn = ggml_backend_dev_t (*)(const char * endpoint);
+    auto * ggml_backend_rpc_add_device_fn = (add_rpc_device_fn) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+    if (!ggml_backend_rpc_add_device_fn) {
+        throw std::invalid_argument("failed to find RPC device add function");
+    }
+
+    static std::unordered_set<std::string> registered;
+    std::vector<ggml_backend_dev_t> devices;
+    for (const auto & server : rpc_servers) {
+        ggml_backend_dev_t dev = nullptr;
+
+        std::string name = string_format("RPC[%s]", server.c_str());
+
+        if (registered.find(server) != registered.end()) {
+            dev = ggml_backend_dev_by_name(name.c_str());
+        }
+
+        if (!dev) {
+            dev = ggml_backend_rpc_add_device_fn(server.c_str());
+            if (!dev) {
+                throw std::invalid_argument(string_format("failed to add RPC device for server '%s'", server.c_str()));
+            }
+            ggml_backend_device_register(dev);
+            registered.insert(server);
+        }
+
+        devices.push_back(dev);
+    }
+
+    return devices;
+}
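Because the RPC entry point is resolved via ggml_backend_reg_get_proc_address and newly created devices are registered globally, the function is idempotent per endpoint: a repeated -rpc argument resolves the existing device by its "RPC[endpoint]" name instead of creating a duplicate. A small sketch of that property (endpoint hypothetical):

    // second call hits the `registered` set and resolves by name
    auto a = register_rpc_device_list("192.168.1.10:50052");
    auto b = register_rpc_device_list("192.168.1.10:50052");
    assert(a[0] == b[0]);   // same ggml_backend_dev_t both times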
+static std::string devices_to_string(const std::vector<ggml_backend_dev_t> & devices) {
+    if (devices.empty()) {
+        return "auto";
+    }
+
+    if (devices.size() == 1 && devices[0] == nullptr) {
+        return "none";
+    }
+
+    std::vector<std::string> names;
+    for (auto * dev : devices) {
+        if (dev == nullptr) {
+            break;
+        }
+        names.push_back(ggml_backend_dev_name(dev));
+    }
+
+    return join(names, "/");
+}
+
 // command line params
 enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN, SQL };
 
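devices_to_string is the inverse of parse_devices_arg, used when reporting results, so a parsed combination round-trips to its normalized text form. A quick property sketch (device names hypothetical and assumed to exist locally):

    assert(devices_to_string(parse_devices_arg("auto")) == "auto");
    assert(devices_to_string(parse_devices_arg("none")) == "none");
    assert(devices_to_string(parse_devices_arg("CUDA0/CUDA1")) == "CUDA0/CUDA1");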
@@ -251,11 +347,11 @@ struct cmd_params {
     std::vector<int> poll;
     std::vector<int> n_gpu_layers;
     std::vector<int> n_cpu_moe;
-    std::vector<std::string> rpc_servers;
     std::vector<llama_split_mode> split_mode;
     std::vector<int> main_gpu;
     std::vector<bool> no_kv_offload;
     std::vector<bool> flash_attn;
+    std::vector<std::vector<ggml_backend_dev_t>> devices;
     std::vector<std::vector<float>> tensor_split;
     std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
     std::vector<bool> use_mmap;
@@ -288,11 +384,11 @@ static const cmd_params cmd_params_defaults = {
     /* poll                 */ { 50 },
     /* n_gpu_layers         */ { 99 },
     /* n_cpu_moe            */ { 0 },
-    /* rpc_servers          */ { "" },
     /* split_mode           */ { LLAMA_SPLIT_MODE_LAYER },
     /* main_gpu             */ { 0 },
     /* no_kv_offload        */ { false },
     /* flash_attn           */ { false },
+    /* devices              */ { {} },
     /* tensor_split         */ { std::vector<float>(llama_max_devices(), 0.0f) },
     /* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
     /* use_mmap             */ { true },
@@ -325,9 +421,13 @@ static void print_usage(int /* argc */, char ** argv) {
            output_format_str(cmd_params_defaults.output_format));
     printf("  -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: %s)\n",
            output_format_str(cmd_params_defaults.output_format_stderr));
+    printf("  --list-devices                            list available devices and exit\n");
     printf("  -v, --verbose                             verbose output\n");
     printf("  --progress                                print test progress indicators\n");
     printf("  --no-warmup                               skip warmup runs before benchmarking\n");
+    if (llama_supports_rpc()) {
+        printf("  -rpc, --rpc <rpc_servers>                 register RPC devices (comma separated)\n");
+    }
     printf("\n");
     printf("test parameters:\n");
     printf("  -m, --model <filename>                    (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
@@ -357,10 +457,6 @@ static void print_usage(int /* argc */, char ** argv) {
            join(cmd_params_defaults.n_gpu_layers, ",").c_str());
     printf("  -ncmoe, --n-cpu-moe <n>                   (default: %s)\n",
            join(cmd_params_defaults.n_cpu_moe, ",").c_str());
-    if (llama_supports_rpc()) {
-        printf("  -rpc, --rpc <rpc_servers>                 (default: %s)\n",
-               join(cmd_params_defaults.rpc_servers, ",").c_str());
-    }
     printf("  -sm, --split-mode <none|layer|row>        (default: %s)\n",
            join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
     printf("  -mg, --main-gpu <i>                       (default: %s)\n",
@@ -369,6 +465,7 @@ static void print_usage(int /* argc */, char ** argv) {
            join(cmd_params_defaults.no_kv_offload, ",").c_str());
     printf("  -fa, --flash-attn <0|1>                   (default: %s)\n",
            join(cmd_params_defaults.flash_attn, ",").c_str());
+    printf("  -dev, --device <dev0/dev1/...>            (default: auto)\n");
     printf("  -mmp, --mmap <0|1>                        (default: %s)\n",
            join(cmd_params_defaults.use_mmap, ",").c_str());
     printf("  -embd, --embeddings <0|1>                 (default: %s)\n",
@@ -533,6 +630,42 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 break;
             }
             params.type_v.insert(params.type_v.end(), types.begin(), types.end());
+        } else if (arg == "-dev" || arg == "--device") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto combos = string_split<std::string>(argv[i], split_delim);
+            for (const auto & combo : combos) {
+                try {
+                    params.devices.push_back(parse_devices_arg(combo));
+                } catch (const std::exception & e) {
+                    fprintf(stderr, "error: %s\n", e.what());
+                    invalid_param = true;
+                    break;
+                }
+            }
+            if (invalid_param) {
+                break;
+            }
+        } else if (arg == "--list-devices") {
+            std::vector<ggml_backend_dev_t> devices;
+            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                auto * dev = ggml_backend_dev_get(i);
+                if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
+                    devices.push_back(dev);
+                }
+            }
+            printf("Available devices:\n");
+            if (devices.empty()) {
+                printf("  (none)\n");
+            }
+            for (auto * dev : devices) {
+                size_t free, total;
+                ggml_backend_dev_memory(dev, &free, &total);
+                printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+            }
+            exit(0);
         } else if (arg == "-t" || arg == "--threads") {
             if (++i >= argc) {
                 invalid_param = true;
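With -dev parsed per combination (comma-separated combos, '/'-separated devices within a combo) and --list-devices exiting right after enumeration, a listing might look like this (names, descriptions, and sizes are illustrative only):

  llama-bench --list-devices
  Available devices:
    CUDA0: NVIDIA GeForce RTX 4090 (24564 MiB, 23906 MiB free)
    CUDA1: NVIDIA GeForce RTX 4090 (24564 MiB, 24012 MiB free)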
@@ -580,7 +713,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 invalid_param = true;
                 break;
             }
-            params.rpc_servers.push_back(argv[i]);
+            try {
+                register_rpc_device_list(argv[i]);
+            } catch (const std::exception & e) {
+                fprintf(stderr, "error: %s\n", e.what());
+                invalid_param = true;
+                break;
+            }
         } else if (arg == "-sm" || arg == "--split-mode") {
             if (++i >= argc) {
                 invalid_param = true;
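Since -rpc now registers its devices during argument parsing, RPC endpoints immediately become ordinary named devices: they show up in --list-devices and can be selected with -dev, provided -rpc appears earlier on the command line (the loop handles arguments in order and --list-devices exits as soon as it is reached). A hypothetical combined invocation (endpoint and device names illustrative):

  llama-bench -rpc 192.168.1.10:50052 --list-devices
  llama-bench -m model.gguf -rpc 192.168.1.10:50052 -dev "RPC[192.168.1.10:50052]/CUDA0"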
@@ -855,9 +994,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.n_cpu_moe.empty()) {
         params.n_cpu_moe = cmd_params_defaults.n_cpu_moe;
     }
-    if (params.rpc_servers.empty()) {
-        params.rpc_servers = cmd_params_defaults.rpc_servers;
-    }
     if (params.split_mode.empty()) {
         params.split_mode = cmd_params_defaults.split_mode;
     }
@@ -870,6 +1006,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.flash_attn.empty()) {
         params.flash_attn = cmd_params_defaults.flash_attn;
     }
+    if (params.devices.empty()) {
+        params.devices = cmd_params_defaults.devices;
+    }
     if (params.tensor_split.empty()) {
         params.tensor_split = cmd_params_defaults.tensor_split;
     }
@@ -916,11 +1055,11 @@ struct cmd_params_instance {
     int poll;
     int n_gpu_layers;
     int n_cpu_moe;
-    std::string rpc_servers_str;
     llama_split_mode split_mode;
     int main_gpu;
     bool no_kv_offload;
     bool flash_attn;
+    std::vector<ggml_backend_dev_t> devices;
     std::vector<float> tensor_split;
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
     bool use_mmap;
@@ -931,57 +1070,8 @@ struct cmd_params_instance {
         llama_model_params mparams = llama_model_default_params();
 
         mparams.n_gpu_layers = n_gpu_layers;
-        if (!rpc_servers_str.empty()) {
-            auto rpc_servers = string_split<std::string>(rpc_servers_str, ',');
-
-            // add RPC devices
-            if (!rpc_servers.empty()) {
-                ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
-                if (!rpc_reg) {
-                    fprintf(stderr, "%s: failed to find RPC backend\n", __func__);
-                    exit(1);
-                }
-
-                typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
-                ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
-                if (!ggml_backend_rpc_add_device_fn) {
-                    fprintf(stderr, "%s: failed to find RPC device add function\n", __func__);
-                    exit(1);
-                }
-                static std::vector<ggml_backend_dev_t> devices;
-                devices.clear();
-                // RPC devices should always come first for performance reasons
-                for (const std::string & server : rpc_servers) {
-                    ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
-                    if (dev) {
-                        devices.push_back(dev);
-                    } else {
-                        fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
-                        exit(1);
-                    }
-                }
-                // FIXME: use llama.cpp device selection logic
-                // add local GPU devices if any
-                for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
-                    ggml_backend_dev_t dev = ggml_backend_dev_get(i);
-                    switch (ggml_backend_dev_type(dev)) {
-                        case GGML_BACKEND_DEVICE_TYPE_CPU:
-                        case GGML_BACKEND_DEVICE_TYPE_ACCEL:
-                            // skip CPU backends since they are handled separately
-                            break;
-
-                        case GGML_BACKEND_DEVICE_TYPE_GPU:
-                            devices.push_back(dev);
-                            break;
-
-                        case GGML_BACKEND_DEVICE_TYPE_IGPU:
-                            // iGPUs are not used when there are RPC servers
-                            break;
-                    }
-                }
-                devices.push_back(nullptr);
-                mparams.devices = devices.data();
-            }
-        }
+        if (!devices.empty()) {
+            mparams.devices = const_cast<ggml_backend_dev_t *>(devices.data());
+        }
         mparams.split_mode = split_mode;
         mparams.main_gpu = main_gpu;
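On the consuming side nothing else changes: the instance's devices vector, already nullptr-terminated by parse_devices_arg, is handed to the model parameters as-is. A minimal sketch of that path in isolation (model path and device names hypothetical):

    llama_model_params mparams = llama_model_default_params();
    std::vector<ggml_backend_dev_t> devices = parse_devices_arg("CUDA0/CUDA1");
    if (!devices.empty()) {
        mparams.devices = const_cast<ggml_backend_dev_t *>(devices.data());  // empty vector == "auto"
    }
    llama_model * model = llama_model_load_from_file("models/7B/ggml-model-q4_0.gguf", mparams);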
@@ -1029,8 +1119,9 @@ struct cmd_params_instance {
 
     bool equal_mparams(const cmd_params_instance & other) const {
         return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe &&
-               rpc_servers_str == other.rpc_servers_str && split_mode == other.split_mode &&
+               split_mode == other.split_mode &&
                main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split &&
+               devices == other.devices &&
                vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
     }
 
@@ -1060,9 +1151,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & m : params.model)
     for (const auto & nl : params.n_gpu_layers)
     for (const auto & ncmoe : params.n_cpu_moe)
-    for (const auto & rpc : params.rpc_servers)
     for (const auto & sm : params.split_mode)
     for (const auto & mg : params.main_gpu)
+    for (const auto & devs : params.devices)
     for (const auto & ts : params.tensor_split)
     for (const auto & ot : params.tensor_buft_overrides)
     for (const auto & mmp : params.use_mmap)
@@ -1098,11 +1189,11 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .poll         = */ pl,
                 /* .n_gpu_layers = */ nl,
                 /* .n_cpu_moe    = */ ncmoe,
-                /* .rpc_servers  = */ rpc,
                 /* .split_mode   = */ sm,
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
                 /* .flash_attn   = */ fa,
+                /* .devices      = */ devs,
                 /* .tensor_split = */ ts,
                 /* .tensor_buft_overrides = */ ot,
                 /* .use_mmap     = */ mmp,
@@ -1131,11 +1222,11 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .poll         = */ pl,
                 /* .n_gpu_layers = */ nl,
                 /* .n_cpu_moe    = */ ncmoe,
-                /* .rpc_servers  = */ rpc,
                 /* .split_mode   = */ sm,
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
                 /* .flash_attn   = */ fa,
+                /* .devices      = */ devs,
                 /* .tensor_split = */ ts,
                 /* .tensor_buft_overrides = */ ot,
                 /* .use_mmap     = */ mmp,
@@ -1164,11 +1255,11 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .poll         = */ pl,
                 /* .n_gpu_layers = */ nl,
                 /* .n_cpu_moe    = */ ncmoe,
-                /* .rpc_servers  = */ rpc,
                 /* .split_mode   = */ sm,
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
                 /* .flash_attn   = */ fa,
+                /* .devices      = */ devs,
                 /* .tensor_split = */ ts,
                 /* .tensor_buft_overrides = */ ot,
                 /* .use_mmap     = */ mmp,
@@ -1206,6 +1297,7 @@ struct test {
     int main_gpu;
     bool no_kv_offload;
     bool flash_attn;
+    std::vector<ggml_backend_dev_t> devices;
     std::vector<float> tensor_split;
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
     bool use_mmap;
@@ -1241,6 +1333,7 @@ struct test {
         main_gpu = inst.main_gpu;
         no_kv_offload = inst.no_kv_offload;
         flash_attn = inst.flash_attn;
+        devices = inst.devices;
         tensor_split = inst.tensor_split;
         tensor_buft_overrides = inst.tensor_buft_overrides;
         use_mmap = inst.use_mmap;
@@ -1287,14 +1380,14 @@ struct test {
 
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
-            "build_commit", "build_number", "cpu_info", "gpu_info", "backends",
-            "model_filename", "model_type", "model_size", "model_n_params", "n_batch",
-            "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll",
-            "type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode",
-            "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
-            "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen",
-            "n_depth", "test_time", "avg_ns", "stddev_ns", "avg_ts",
-            "stddev_ts"
+            "build_commit", "build_number", "cpu_info", "gpu_info", "backends",
+            "model_filename", "model_type", "model_size", "model_n_params", "n_batch",
+            "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll",
+            "type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode",
+            "main_gpu", "no_kv_offload", "flash_attn", "devices", "tensor_split",
+            "tensor_buft_overrides", "use_mmap", "embeddings", "no_op_offload",
+            "n_prompt", "n_gen", "n_depth", "test_time", "avg_ns",
+            "stddev_ns", "avg_ts", "stddev_ts"
         };
         return fields;
     }
@@ -1378,6 +1471,7 @@ struct test {
             std::to_string(main_gpu),
             std::to_string(no_kv_offload),
             std::to_string(flash_attn),
+            devices_to_string(devices),
             tensor_split_str,
             tensor_buft_overrides_str,
             std::to_string(use_mmap),
@@ -1559,6 +1653,9 @@ struct markdown_printer : public printer {
         if (field == "flash_attn") {
             return 2;
         }
+        if (field == "devices") {
+            return -12;
+        }
         if (field == "use_mmap") {
             return 4;
         }
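(A negative width here appears to follow the printer's existing convention of marking left-aligned columns, so the dev column would render left-aligned in a 12-character field, like other text-valued columns.)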
@@ -1602,6 +1699,9 @@ struct markdown_printer : public printer {
         if (field == "no_op_offload") {
             return "nopo";
         }
+        if (field == "devices") {
+            return "dev";
+        }
         if (field == "tensor_split") {
             return "ts";
         }
@@ -1661,6 +1761,9 @@ struct markdown_printer : public printer {
         if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
             fields.emplace_back("flash_attn");
         }
+        if (params.devices.size() > 1 || params.devices != cmd_params_defaults.devices) {
+            fields.emplace_back("devices");
+        }
         if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
             fields.emplace_back("tensor_split");
         }