llama: use FA + max. GPU layers by default (#15434)

* llama: use max. GPU layers by default, auto -fa

* ggml-backend: abort instead of segfault
This commit is contained in:
Johannes Gäßler
2025-08-30 16:32:10 +02:00
committed by GitHub
parent 38ad381f9f
commit e81b8e4b7f
19 changed files with 235 additions and 72 deletions

View File

@@ -687,7 +687,8 @@ struct llm_graph_context {
ggml_tensor * kq_mask,
ggml_tensor * sinks, // [n_head_q]
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
- float kq_scale) const;
+ float kq_scale,
+ int il) const;
llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;