mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-10-29 08:41:22 +00:00
llama: use FA + max. GPU layers by default (#15434)
* llama: use max. GPU layers by default, auto -fa * ggml-backend: abort instead of segfault
This commit is contained in:
@@ -66,7 +66,7 @@ class ServerProcess:
|
||||
n_slots: int | None = None
|
||||
ctk: str | None = None
|
||||
ctv: str | None = None
|
||||
fa: bool | None = None
|
||||
fa: str | None = None
|
||||
server_continuous_batching: bool | None = False
|
||||
server_embeddings: bool | None = False
|
||||
server_reranking: bool | None = False
|
||||
@@ -161,7 +161,7 @@ class ServerProcess:
|
||||
if self.ctv:
|
||||
server_args.extend(["-ctv", self.ctv])
|
||||
if self.fa is not None:
|
||||
server_args.append("-fa")
|
||||
server_args.extend(["-fa", self.fa])
|
||||
if self.n_predict:
|
||||
server_args.extend(["--n-predict", self.n_predict])
|
||||
if self.slot_save_path:
|
||||
@@ -427,7 +427,7 @@ class ServerPreset:
|
||||
server.n_batch = 300
|
||||
server.n_ubatch = 300
|
||||
server.n_slots = 2
|
||||
server.fa = True
|
||||
server.fa = "on"
|
||||
server.seed = 42
|
||||
server.server_embeddings = True
|
||||
return server
|
||||
|
||||
Reference in New Issue
Block a user