llama: use FA + max. GPU layers by default (#15434)

* llama: use max. GPU layers by default, auto -fa * ggml-backend: abort instead of segfault
2025-10-27 08:21:30 +00:00 · 2025-08-30 16:32:10 +02:00
parent 38ad381f9f
commit e81b8e4b7f
19 changed files with 235 additions and 72 deletions
--- a/examples/diffusion/diffusion-cli.cpp
+++ b/examples/diffusion/diffusion-cli.cpp
@@ -564,7 +564,7 @@ int main(int argc, char ** argv) {
    ctx_params.n_ctx                = params.n_ctx;
    ctx_params.n_batch              = params.n_batch;
    ctx_params.n_ubatch             = params.n_ubatch;
-    ctx_params.flash_attn           = params.flash_attn;
+    ctx_params.flash_attn_type      = params.flash_attn_type;
    ctx_params.no_perf              = params.no_perf;
    ctx_params.type_k               = params.cache_type_k;
    ctx_params.type_v               = params.cache_type_v;