Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-28 08:31:25 +00:00)
llama: use FA + max. GPU layers by default (#15434)
* llama: use max. GPU layers by default, auto -fa
* ggml-backend: abort instead of segfault
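As a rough illustration of the second bullet, here is a minimal sketch of the abort-instead-of-segfault pattern; the helper function and the failing condition are hypothetical stand-ins, not the code path actually changed by this commit, and only the GGML_ABORT macro and the ggml_tensor fields are existing ggml API.

// Sketch only: hypothetical helper showing the abort-instead-of-segfault pattern.
// GGML_ABORT is the existing fatal-error macro from ggml.h; the check below is an
// assumption for illustration, not the ggml-backend code touched by this commit.
#include "ggml.h"

static void use_tensor_buffer(const struct ggml_tensor * t) {
    if (t->buffer == NULL) {
        // before: dereferencing a missing backend buffer could segfault;
        // after: fail loudly with a clear message instead
        GGML_ABORT("tensor '%s' has no backend buffer allocated", t->name);
    }
    // ... safe to use t->buffer from here on ...
}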
@@ -687,7 +687,8 @@ struct llm_graph_context {
             ggml_tensor * kq_mask,
             ggml_tensor * sinks, // [n_head_q]
             ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
-            float kq_scale) const;
+            float kq_scale,
+            int il) const;
 
     llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
 
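For context on the added parameter, a self-contained sketch of the calling pattern the hunk suggests: the attention builder now receives the layer index il explicitly, so a per-layer loop passes it through. All names below are stand-ins for illustration, not the actual llama.cpp API.

// Stand-alone sketch (assumed names): mimics threading the new `il` (layer index)
// argument through a per-layer graph-building loop, as the signature change implies.
#include <cstdio>

// stand-in for the real attention builder, which takes ggml tensors and returns one
static void build_attn_stub(float kq_scale, int il) {
    std::printf("build attention: layer %d, kq_scale %.4f\n", il, kq_scale);
}

int main() {
    const int   n_layer  = 4;
    const float kq_scale = 1.0f/8.0f; // e.g. 1/sqrt(n_embd_head)
    for (int il = 0; il < n_layer; ++il) {
        build_attn_stub(kq_scale, il); // layer index is now passed explicitly
    }
    return 0;
}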