llama: use FA + max. GPU layers by default (#15434)

* llama: use max. GPU layers by default, auto -fa

* ggml-backend: abort instead of segfault
Author: Johannes Gäßler
Date: 2025-08-30 16:32:10 +02:00
Committed by: GitHub
Parent: 38ad381f9f
Commit: e81b8e4b7f
19 changed files with 235 additions and 72 deletions
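
As the commit message says, -fa is no longer a plain boolean toggle: it now takes a mode string, so the server test harness stores it as str | None and forwards the value on the command line. Below is a minimal sketch of the old versus new argument building; the helper names are illustrative, and the accepted values ("on"/"off"/"auto") are an assumption beyond the "on" shown in this diff.

def build_fa_args_old(fa: bool | None) -> list[str]:
    # Old behaviour: -fa was a boolean switch appended with no value.
    return ["-fa"] if fa else []

def build_fa_args_new(fa: str | None) -> list[str]:
    # New behaviour: -fa carries an explicit mode string.
    return ["-fa", fa] if fa is not None else []

print(build_fa_args_old(True))    # ['-fa']
print(build_fa_args_new("on"))    # ['-fa', 'on']
print(build_fa_args_new("auto"))  # ['-fa', 'auto']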


@@ -66,7 +66,7 @@ class ServerProcess:
     n_slots: int | None = None
     ctk: str | None = None
     ctv: str | None = None
-    fa: bool | None = None
+    fa: str | None = None
     server_continuous_batching: bool | None = False
     server_embeddings: bool | None = False
     server_reranking: bool | None = False
@@ -161,7 +161,7 @@ class ServerProcess:
         if self.ctv:
             server_args.extend(["-ctv", self.ctv])
         if self.fa is not None:
-            server_args.append("-fa")
+            server_args.extend(["-fa", self.fa])
         if self.n_predict:
             server_args.extend(["--n-predict", self.n_predict])
         if self.slot_save_path:
@@ -427,7 +427,7 @@ class ServerPreset:
         server.n_batch = 300
         server.n_ubatch = 300
         server.n_slots = 2
-        server.fa = True
+        server.fa = "on"
         server.seed = 42
         server.server_embeddings = True
         return server
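
For downstream tests, fixtures that used to enable flash attention with a boolean now assign the string form instead; a minimal usage sketch, with the preset name assumed rather than taken from this diff:

server = ServerPreset.tinyllama2()  # assumed preset name, not shown in this diff
server.fa = "on"                    # previously: server.fa = True
server.start()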