llama: use FA + max. GPU layers by default (#15434)

* llama: use max. GPU layers by default, auto -fa

* ggml-backend: abort instead of segfault
This commit is contained in:
Johannes Gäßler
2025-08-30 16:32:10 +02:00
committed by GitHub
parent 38ad381f9f
commit e81b8e4b7f
19 changed files with 235 additions and 72 deletions

View File

@@ -179,6 +179,14 @@ extern "C" {
LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
};
// Selects when Flash Attention is enabled.
// The numeric values are explicit; keep them stable when adding new entries.
enum llama_flash_attn_type {
    LLAMA_FLASH_ATTN_TYPE_AUTO     = -1, // automatic selection (criteria are not visible in this header excerpt)
    LLAMA_FLASH_ATTN_TYPE_DISABLED =  0, // Flash Attention disabled
    LLAMA_FLASH_ATTN_TYPE_ENABLED  =  1, // Flash Attention enabled
};
// Maps a `enum llama_flash_attn_type` value to a C string.
// NOTE(review): presumably a human-readable name for logging/printing, judging by the
// function name; the definition is not visible here, so the exact strings, the result
// for out-of-range values, and the lifetime of the returned pointer should be confirmed.
LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);
enum llama_split_mode {
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
@@ -303,6 +311,7 @@ extern "C" {
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
enum llama_attention_type attention_type; // attention type to use for embeddings
enum llama_flash_attn_type flash_attn_type; // when to enable Flash Attention
// ref: https://github.com/ggml-org/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -329,7 +338,6 @@ extern "C" {
// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
bool embeddings; // if true, extract embeddings (together with logits)
bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // use flash attention [EXPERIMENTAL]
bool no_perf; // measure performance timings
bool op_offload; // offload host tensor operations to device
bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)