llama: use FA + max. GPU layers by default (#15434)

* llama: use max. GPU layers by default, auto -fa

* ggml-backend: abort instead of segfault
Author: Johannes Gäßler
Date: 2025-08-30 16:32:10 +02:00
Committed by: GitHub
Parent: 38ad381f9f
Commit: e81b8e4b7f

19 changed files with 235 additions and 72 deletions
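The second commit-message bullet concerns robustness: where ggml-backend previously dereferenced a null pointer and crashed with a segfault, it now aborts with a diagnostic. A minimal sketch of the pattern, with all names hypothetical (the real code uses ggml's own abort machinery; `Buffer`, `buffer_get_data`, and `ABORT_MSG` here are illustrative stand-ins):

```cpp
#include <cstdio>
#include <cstdlib>

// Illustrative stand-in for an abort-with-message macro: report the
// failing location on stderr, then abort() so the process dies loudly
// and debuggably instead of segfaulting on a null dereference.
#define ABORT_MSG(msg) \
    do { \
        std::fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__, msg); \
        std::abort(); \
    } while (0)

// Hypothetical stand-in for a ggml backend buffer.
struct Buffer {
    void * data = nullptr;
};

// Before: a null buffer fell through to a dereference (SIGSEGV).
// After: the null case is caught and reported before any access.
void * buffer_get_data(Buffer * buf) {
    if (buf == nullptr || buf->data == nullptr) {
        ABORT_MSG("tensor buffer was not allocated");
    }
    return buf->data;
}

int main() {
    Buffer buf; // data is null: previously a segfault, now a clean abort
    buffer_get_data(&buf);
}
```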


@@ -323,7 +323,7 @@ def run(
     server.jinja = True
     server.ctk = ctk
     server.ctv = ctv
-    server.fa = fa
+    server.fa = "on" if fa else "off"
     server.n_predict = n_predict
     server.model_hf_repo = hf
     server.model_hf_file = None
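This hunk adapts the Python test harness to the server's flash-attention setting no longer being a plain boolean: per the commit title, `-fa` now defaults to auto, so the flag takes "on", "off", or "auto" and the harness maps its boolean onto the first two. A sketch of how such a tri-state flag might be handled on the C++ side, with all names hypothetical:

```cpp
#include <stdexcept>
#include <string>

// Hypothetical tri-state for the flash-attention setting: the commit
// moves the default from a plain on/off boolean to "auto".
enum class FlashAttn { OFF, ON, AUTO };

FlashAttn parse_flash_attn(const std::string & value) {
    if (value == "on")   { return FlashAttn::ON;   }
    if (value == "off")  { return FlashAttn::OFF;  }
    if (value == "auto") { return FlashAttn::AUTO; }
    throw std::invalid_argument("expected on, off, or auto: " + value);
}

// With AUTO, the effective value is decided later, e.g. from whether
// the selected backend supports flash attention (illustrative only).
bool resolve_flash_attn(FlashAttn fa, bool backend_supports_fa) {
    switch (fa) {
        case FlashAttn::ON:  return true;
        case FlashAttn::OFF: return false;
        default:             return backend_supports_fa;
    }
}
```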