Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-29 08:41:22 +00:00)
llama: use FA + max. GPU layers by default (#15434)
* llama: use max. GPU layers by default, auto -fa
* ggml-backend: abort instead of segfault
@@ -323,7 +323,7 @@ def run(
     server.jinja = True
     server.ctk = ctk
     server.ctv = ctv
-    server.fa = fa
+    server.fa = "on" if fa else "off"
     server.n_predict = n_predict
     server.model_hf_repo = hf
     server.model_hf_file = None
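As the hunk shows, the script's flash-attention field changes from a boolean to a string, matching the new tri-state -fa behavior ("on"/"off"/"auto", with auto as the default) described in the commit message. Below is a minimal sketch of the same mapping for callers that still carry a boolean flag; the fa_to_string helper and its Optional[bool] parameter are hypothetical, not part of this commit.

from typing import Optional


def fa_to_string(fa: Optional[bool]) -> str:
    """Map a legacy boolean flash-attention flag to the new tri-state value.

    None  -> "auto"  (let the backend decide, the new default)
    True  -> "on"
    False -> "off"
    """
    if fa is None:
        return "auto"
    return "on" if fa else "off"


# Example: mirrors the changed line in the hunk above.
server_fa = fa_to_string(True)
print(server_fa)  # prints "on"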