llama: use FA + max. GPU layers by default (#15434)

* llama: use max. GPU layers by default, auto -fa

* ggml-backend: abort instead of segfault
Author:    Johannes Gäßler
Date:      2025-08-30 16:32:10 +02:00 (committed via GitHub)
Parent:    38ad381f9f
Commit:    e81b8e4b7f
19 changed files with 235 additions and 72 deletions

@@ -1545,10 +1545,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
-        {"-fa", "--flash-attn"},
-        string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.flash_attn = true;
+        {"-fa", "--flash-attn"}, "FA",
+        string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')", llama_flash_attn_type_name(params.flash_attn_type)),
+        [](common_params & params, const std::string & value) {
+            if (value == "on" || value == "enabled") {
+                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
+            } else if (value == "off" || value == "disabled") {
+                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
+            } else if (value == "auto") {
+                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
+            } else {
+                throw std::runtime_error(string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
+            }
         }
     ).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(common_arg(
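
For context, this hunk turns -fa from a boolean switch into a tri-state argument. Below is a minimal standalone sketch of the same parsing logic; the enum is a local stand-in for the llama_flash_attn_type values referenced in the diff, and its numeric values are an assumption, not taken from llama.h:

    #include <cstdio>
    #include <initializer_list>
    #include <stdexcept>
    #include <string>

    // Local stand-in for the llama_flash_attn_type enum introduced by this
    // change; the numeric values here are assumptions for illustration only.
    enum flash_attn_type {
        FLASH_ATTN_TYPE_AUTO     = -1,
        FLASH_ATTN_TYPE_DISABLED = 0,
        FLASH_ATTN_TYPE_ENABLED  = 1,
    };

    // Mirrors the parsing logic in the diff above: "on"/"enabled" and
    // "off"/"disabled" are accepted as aliases, anything else is an error.
    static flash_attn_type parse_flash_attn(const std::string & value) {
        if (value == "on" || value == "enabled") {
            return FLASH_ATTN_TYPE_ENABLED;
        }
        if (value == "off" || value == "disabled") {
            return FLASH_ATTN_TYPE_DISABLED;
        }
        if (value == "auto") {
            return FLASH_ATTN_TYPE_AUTO;
        }
        throw std::runtime_error("unknown value for --flash-attn: '" + value + "'");
    }

    int main() {
        for (const char * arg : {"on", "off", "auto"}) {
            std::printf("-fa %-4s -> %d\n", arg, parse_flash_attn(arg));
        }
        return 0;
    }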
add_opt(common_arg(
@@ -3459,8 +3467,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
             params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
             params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
@@ -3475,8 +3481,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
             params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
             params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
@@ -3491,8 +3495,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
             params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
             params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
@@ -3508,10 +3510,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
             params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
             params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
-            params.speculative.n_gpu_layers = 99;
             params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
@@ -3527,10 +3526,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
             params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
             params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
-            params.speculative.n_gpu_layers = 99;
             params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
@@ -3545,8 +3541,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF";
             params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
             params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
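
The preset hunks above are mechanical: every server preset drops its explicit params.n_gpu_layers = 99; and params.flash_attn = true; (and, for the speculative presets, params.speculative.n_gpu_layers = 99;), since offloading the maximum number of layers and auto Flash Attention are now the global defaults. The ggml-backend half of the commit message ("abort instead of segfault") is not covered by the hunks shown here; the pattern is presumably along these lines (a hypothetical sketch, not the actual ggml-backend code):

    #include <cstdio>
    #include <cstdlib>

    // Hypothetical illustration of "abort instead of segfault": fail loudly
    // on a null buffer instead of dereferencing it and crashing without any
    // diagnostic. Function and parameter names are invented for this sketch.
    static void backend_buffer_use(const void * buffer, const char * op) {
        if (buffer == nullptr) {
            std::fprintf(stderr, "%s: buffer is null (not allocated?)\n", op);
            std::abort(); // deliberate, debuggable failure instead of a segfault
        }
        // ... the normal path would use the buffer here ...
    }

    int main() {
        backend_buffer_use(nullptr, "example_op"); // aborts with a clear message
        return 0;
    }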