common : update presets (#16504)

* presets : add --embd-gemma-default and remove old embedding presets

* presets : add gpt-oss presets

* presets : add vision presets

* cont : remove reasoning overrides [no ci]

* cont : fix batch size for embedding gemma [no ci]
This commit is contained in:
Georgi Gerganov
2025-10-12 09:29:13 +03:00
committed by GitHub
parent 41aac5c69b
commit 4b2dae383d
2 changed files with 162 additions and 134 deletions

View File

@@ -3358,7 +3358,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
add_opt(common_arg( add_opt(common_arg(
{"--chat-template-kwargs"}, "STRING", {"--chat-template-kwargs"}, "STRING",
string_format("sets additional params for the json template parser"), string_format("sets additional params for the json template parser"),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
auto parsed = json::parse(value); auto parsed = json::parse(value);
for (const auto & item : parsed.items()) { for (const auto & item : parsed.items()) {
params.default_template_kwargs[item.key()] = item.value().dump(); params.default_template_kwargs[item.key()] = item.value().dump();
@@ -3570,21 +3570,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
common_log_set_file(common_log_main(), value.c_str()); common_log_set_file(common_log_main(), value.c_str());
} }
)); ));
add_opt(common_arg({ "--log-colors" }, "[on|off|auto]", add_opt(common_arg(
"Set colored logging ('on', 'off', or 'auto', default: 'auto')\n" {"--log-colors"}, "[on|off|auto]",
"'auto' enables colors when output is to a terminal", "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
[](common_params &, const std::string & value) { "'auto' enables colors when output is to a terminal",
if (is_truthy(value)) { [](common_params &, const std::string & value) {
common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED); if (is_truthy(value)) {
} else if (is_falsey(value)) { common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED); } else if (is_falsey(value)) {
} else if (is_autoy(value)) { common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
common_log_set_colors(common_log_main(), LOG_COLORS_AUTO); } else if (is_autoy(value)) {
} else { common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
throw std::invalid_argument( } else {
string_format("error: unkown value for --log-colors: '%s'\n", value.c_str())); throw std::invalid_argument(
} string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
}).set_env("LLAMA_LOG_COLORS")); }
}
).set_env("LLAMA_LOG_COLORS"));
add_opt(common_arg( add_opt(common_arg(
{"-v", "--verbose", "--log-verbose"}, {"-v", "--verbose", "--log-verbose"},
"Set verbosity level to infinity (i.e. log all messages, useful for debugging)", "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
@@ -3850,7 +3852,87 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
).set_examples({LLAMA_EXAMPLE_TTS})); ).set_examples({LLAMA_EXAMPLE_TTS}));
// model-specific add_opt(common_arg(
{"--diffusion-steps"}, "N",
string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
[](common_params & params, int value) { params.diffusion.steps = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
// --diffusion-visual: switch without a value; when present the handler sets
// diffusion.visual_mode to true. The help text embeds the value that
// params.diffusion.visual_mode holds at registration time. The option is
// tagged with LLAMA_EXAMPLE_DIFFUSION through set_examples().
add_opt(common_arg(
    {"--diffusion-visual"},
    string_format("enable visual diffusion mode (show progressive generation) (default: %s)", params.diffusion.visual_mode ? "true" : "false"),
    [](common_params & cfg) {
        cfg.diffusion.visual_mode = true;
    }
).set_examples({LLAMA_EXAMPLE_DIFFUSION}));
// --diffusion-eps F: the raw argument text is converted with std::stof and
// stored in diffusion.eps. std::stof throws std::invalid_argument or
// std::out_of_range when the text cannot be converted.
add_opt(common_arg(
    {"--diffusion-eps"}, "F",
    string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
    [](common_params & cfg, const std::string & text) {
        cfg.diffusion.eps = std::stof(text);
    }
).set_examples({LLAMA_EXAMPLE_DIFFUSION}));
// --diffusion-algorithm N: the handler receives the value as an int and
// stores it unchanged in diffusion.algorithm. The numeric codes are listed in
// the help text; no range check is performed here.
add_opt(common_arg(
    {"--diffusion-algorithm"}, "N",
    string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)", params.diffusion.algorithm),
    [](common_params & cfg, int algo) {
        cfg.diffusion.algorithm = algo;
    }
).set_examples({LLAMA_EXAMPLE_DIFFUSION}));
// --diffusion-alg-temp F: converts the argument text with std::stof (throws on
// unparsable input) and stores the result in diffusion.alg_temp.
add_opt(common_arg(
    {"--diffusion-alg-temp"}, "F",
    string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
    [](common_params & cfg, const std::string & text) {
        cfg.diffusion.alg_temp = std::stof(text);
    }
).set_examples({LLAMA_EXAMPLE_DIFFUSION}));
// --diffusion-block-length N: the handler receives the value as an int and
// copies it into diffusion.block_length without validation.
add_opt(common_arg(
    {"--diffusion-block-length"}, "N",
    string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
    [](common_params & cfg, int length) {
        cfg.diffusion.block_length = length;
    }
).set_examples({LLAMA_EXAMPLE_DIFFUSION}));
// --diffusion-cfg-scale F: converts the argument text with std::stof (throws on
// unparsable input) and stores the result in diffusion.cfg_scale.
add_opt(common_arg(
    {"--diffusion-cfg-scale"}, "F",
    string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
    [](common_params & cfg, const std::string & text) {
        cfg.diffusion.cfg_scale = std::stof(text);
    }
).set_examples({LLAMA_EXAMPLE_DIFFUSION}));
// --diffusion-add-gumbel-noise F: the argument is parsed as a float with
// std::stof (throws std::invalid_argument / std::out_of_range on bad input) and
// any non-zero result enables diffusion.add_gumbel_noise.
// NOTE(review): the help text prints add_gumbel_noise as "true"/"false", so the
// field is presumably a bool - confirm against its declaration. The comparison
// below spells out the float -> bool conversion that the plain assignment
// performed implicitly; for a bool field the stored value is unchanged.
add_opt(common_arg(
    {"--diffusion-add-gumbel-noise"}, "F",
    string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
    [](common_params & cfg, const std::string & text) {
        cfg.diffusion.add_gumbel_noise = std::stof(text) != 0.0f;
    }
).set_examples({LLAMA_EXAMPLE_DIFFUSION}));
// -lr / --learning-rate ALPHA: converts the argument text with std::stof
// (throws on unparsable input) and stores the result in lr.lr0.
add_opt(common_arg(
    {"-lr", "--learning-rate"}, "ALPHA",
    string_format("adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)", (double) params.lr.lr0),
    [](common_params & cfg, const std::string & text) {
        cfg.lr.lr0 = std::stof(text);
    }
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
// -lr-min / --learning-rate-min ALPHA: converts the argument text with
// std::stof (throws on unparsable input) and stores the result in lr.lr_min.
add_opt(common_arg(
    {"-lr-min", "--learning-rate-min"}, "ALPHA",
    string_format("(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)", (double) params.lr.lr_min),
    [](common_params & cfg, const std::string & text) {
        cfg.lr.lr_min = std::stof(text);
    }
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
// -decay-epochs / --learning-rate-decay-epochs: converts the argument text with
// std::stof (throws on unparsable input) and stores it in lr.decay_epochs.
// NOTE(review): the value placeholder reads "ALPHA" although the help text
// describes a number of epochs - possibly copied from -lr; kept as-is so the
// --help output does not change.
add_opt(common_arg(
    {"-decay-epochs", "--learning-rate-decay-epochs"}, "ALPHA",
    string_format("(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)", (double) params.lr.decay_epochs),
    [](common_params & cfg, const std::string & text) {
        cfg.lr.decay_epochs = std::stof(text);
    }
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
// -wd / --weight-decay WD: converts the argument text with std::stof (throws on
// unparsable input) and stores the result in lr.wd.
add_opt(common_arg(
    {"-wd", "--weight-decay"}, "WD",
    string_format("adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).", (double) params.lr.wd),
    [](common_params & cfg, const std::string & text) {
        cfg.lr.wd = std::stof(text);
    }
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
// -val-split / --val-split FRACTION: converts the argument text with std::stof
// (throws on unparsable input) and stores the result in val_split. No range
// check is applied to the fraction here.
add_opt(common_arg(
    {"-val-split", "--val-split"}, "FRACTION",
    string_format("fraction of data to use as validation set for training (default: %.2g).", (double) params.val_split),
    [](common_params & cfg, const std::string & text) {
        cfg.val_split = std::stof(text);
    }
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
// -epochs / --epochs N: the handler receives the value as an int and assigns
// it to lr.epochs without validation.
add_opt(common_arg(
    {"-epochs", "--epochs"}, "N",
    string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
    [](common_params & cfg, int n_epochs) {
        cfg.lr.epochs = n_epochs;
    }
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
// -opt / --optimizer sgd|adamw: resolves the given name through
// common_opt_get_optimizer() and stores the result in params.optimizer.
// GGML_OPT_OPTIMIZER_TYPE_COUNT is treated as the "not recognised" result and
// is rejected with std::invalid_argument; note that the field has already been
// overwritten by the time the exception is thrown.
add_opt(common_arg(
    {"-opt", "--optimizer"}, "sgd|adamw", "adamw or sgd",
    [](common_params & cfg, const std::string & opt_name) {
        cfg.optimizer = common_opt_get_optimizer(opt_name.c_str());
        if (cfg.optimizer != GGML_OPT_OPTIMIZER_TYPE_COUNT) {
            return;
        }
        throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
    }
).set_examples({LLAMA_EXAMPLE_FINETUNE}));
// presets
add_opt(common_arg( add_opt(common_arg(
{"--tts-oute-default"}, {"--tts-oute-default"},
string_format("use default OuteTTS models (note: can download weights from the internet)"), string_format("use default OuteTTS models (note: can download weights from the internet)"),
@@ -3863,39 +3945,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_TTS})); ).set_examples({LLAMA_EXAMPLE_TTS}));
add_opt(common_arg( add_opt(common_arg(
{"--embd-bge-small-en-default"}, {"--embd-gemma-default"},
string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"), string_format("use default EmbeddingGemma model (note: can download weights from the internet)"),
[](common_params & params) { [](common_params & params) {
params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF"; params.model.hf_repo = "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF";
params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf"; params.model.hf_file = "embeddinggemma-300M-qat-Q4_0.gguf";
params.embd_normalize = 2; params.port = 8011;
params.n_ctx = 512; params.n_ubatch = 2048;
params.verbose_prompt = true; params.n_batch = 2048;
params.embedding = true; params.n_parallel = 32;
} params.n_ctx = 2048*params.n_parallel;
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--embd-e5-small-en-default"},
string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
[](common_params & params) {
params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
params.model.hf_file = "e5-small-v2-q8_0.gguf";
params.embd_normalize = 2;
params.n_ctx = 512;
params.verbose_prompt = true;
params.embedding = true;
}
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--embd-gte-small-default"},
string_format("use default gte-small model (note: can download weights from the internet)"),
[](common_params & params) {
params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
params.model.hf_file = "gte-small-q8_0.gguf";
params.embd_normalize = 2;
params.n_ctx = 512;
params.verbose_prompt = true; params.verbose_prompt = true;
params.embedding = true; params.embedding = true;
} }
@@ -3990,96 +4049,65 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg( add_opt(common_arg(
{ "--diffusion-steps" }, "N", {"--gpt-oss-20b-default"},
string_format("number of diffusion steps (default: %d)", params.diffusion.steps), string_format("use gpt-oss-20b (note: can download weights from the internet)"),
[](common_params & params, int value) { params.diffusion.steps = value; } [](common_params & params) {
).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); params.model.hf_repo = "ggml-org/gpt-oss-20b-GGUF";
add_opt(common_arg( params.model.hf_file = "gpt-oss-20b-mxfp4.gguf";
{ "--diffusion-visual" }, params.port = 8013;
string_format("enable visual diffusion mode (show progressive generation) (default: %s)", params.n_ubatch = 2048;
params.diffusion.visual_mode ? "true" : "false"), params.n_batch = 32768;
[](common_params & params) { params.diffusion.visual_mode = true; } params.n_parallel = 2;
).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); params.n_ctx = 131072*params.n_parallel;
params.sampling.temp = 1.0f;
params.sampling.top_p = 1.0f;
params.sampling.top_k = 0;
params.sampling.min_p = 0.01f;
params.use_jinja = true;
//params.default_template_kwargs["reasoning_effort"] = "\"high\"";
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg( add_opt(common_arg(
{ "--diffusion-eps" }, "F", {"--gpt-oss-120b-default"},
string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps), string_format("use gpt-oss-120b (note: can download weights from the internet)"),
[](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); } [](common_params & params) {
).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); params.model.hf_repo = "ggml-org/gpt-oss-120b-GGUF";
add_opt(common_arg( params.port = 8013;
{ "--diffusion-algorithm" }, "N", params.n_ubatch = 2048;
string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)", params.n_batch = 32768;
params.diffusion.algorithm), params.n_parallel = 2;
[](common_params & params, int value) { params.diffusion.algorithm = value; } params.n_ctx = 131072*params.n_parallel;
).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); params.sampling.temp = 1.0f;
add_opt(common_arg( params.sampling.top_p = 1.0f;
{ "--diffusion-alg-temp" }, "F", params.sampling.top_k = 0;
string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp), params.sampling.min_p = 0.01f;
[](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); } params.use_jinja = true;
).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg( add_opt(common_arg(
{ "--diffusion-block-length" }, "N", {"--vision-gemma-4b-default"},
string_format("llada block length for generation (default: %d)", params.diffusion.block_length), string_format("use Gemma 3 4B QAT (note: can download weights from the internet)"),
[](common_params & params, int value) { params.diffusion.block_length = value; } [](common_params & params) {
).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); params.model.hf_repo = "ggml-org/gemma-3-4b-it-qat-GGUF";
add_opt(common_arg( params.port = 8014;
{ "--diffusion-cfg-scale" }, "F", params.n_ctx = 0;
string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale), params.use_jinja = true;
[](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); } }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); ).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{ "--diffusion-add-gumbel-noise" }, "F",
string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
[](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(
common_arg({ "-lr", "--learning-rate" }, "ALPHA",
string_format(
"adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)",
(double) params.lr.lr0),
[](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); })
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
add_opt(
common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
string_format(
"(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
(double) params.lr.lr_min),
[](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); })
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
add_opt(
common_arg({ "-decay-epochs", "--learning-rate-decay-epochs" }, "ALPHA",
string_format(
"(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)",
(double) params.lr.decay_epochs),
[](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); })
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
add_opt(common_arg( add_opt(common_arg(
{ "-wd", "--weight-decay" }, "WD", {"--vision-gemma-12b-default"},
string_format( string_format("use Gemma 3 12B QAT (note: can download weights from the internet)"),
"adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).", [](common_params & params) {
(double) params.lr.wd), params.model.hf_repo = "ggml-org/gemma-3-12b-it-qat-GGUF";
[](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); }) params.port = 8014;
.set_examples({ LLAMA_EXAMPLE_FINETUNE })); params.n_ctx = 0;
add_opt(common_arg({ "-val-split", "--val-split" }, "FRACTION", params.use_jinja = true;
string_format("fraction of data to use as validation set for training (default: %.2g).", }
(double) params.val_split), ).set_examples({LLAMA_EXAMPLE_SERVER}));
[](common_params & params, const std::string & value) { params.val_split = std::stof(value); })
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
add_opt(common_arg({ "-epochs", "--epochs" }, "N",
string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
[](common_params & params, int epochs) { params.lr.epochs = epochs; })
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
add_opt(common_arg({ "-opt", "--optimizer" }, "sgd|adamw", "adamw or sgd",
[](common_params & params, const std::string & name) {
params.optimizer = common_opt_get_optimizer(name.c_str());
if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
}
})
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
return ctx_arg; return ctx_arg;
} }

View File

@@ -426,7 +426,7 @@ struct common_params {
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
int32_t cache_ram_mib = 8192; // 0 = no limit, 1 = 1 MiB, etc. int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
std::string hostname = "127.0.0.1"; std::string hostname = "127.0.0.1";
std::string public_path = ""; // NOLINT std::string public_path = ""; // NOLINT