mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-10-27 08:21:30 +00:00
llama: use FA + max. GPU layers by default (#15434)
* llama: use max. GPU layers by default, auto -fa * ggml-backend: abort instead of segfault
This commit is contained in:
@@ -111,7 +111,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if (!params.batched_bench_output_jsonl) {
|
||||
LOG("\n");
|
||||
LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
|
||||
LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, int(params.flash_attn_type), params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
|
||||
LOG("\n");
|
||||
LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
||||
LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
|
||||
@@ -197,7 +197,7 @@ int main(int argc, char ** argv) {
|
||||
LOG(
|
||||
"{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
|
||||
"\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
|
||||
n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
|
||||
n_kv_max, params.n_batch, params.n_ubatch, int(params.flash_attn_type), params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
|
||||
pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
|
||||
);
|
||||
} else {
|
||||
|
||||
@@ -987,16 +987,16 @@ struct cmd_params_instance {
|
||||
llama_context_params to_llama_cparams() const {
|
||||
llama_context_params cparams = llama_context_default_params();
|
||||
|
||||
cparams.n_ctx = n_prompt + n_gen + n_depth;
|
||||
cparams.n_batch = n_batch;
|
||||
cparams.n_ubatch = n_ubatch;
|
||||
cparams.type_k = type_k;
|
||||
cparams.type_v = type_v;
|
||||
cparams.offload_kqv = !no_kv_offload;
|
||||
cparams.flash_attn = flash_attn;
|
||||
cparams.embeddings = embeddings;
|
||||
cparams.op_offload = !no_op_offload;
|
||||
cparams.swa_full = false;
|
||||
cparams.n_ctx = n_prompt + n_gen + n_depth;
|
||||
cparams.n_batch = n_batch;
|
||||
cparams.n_ubatch = n_ubatch;
|
||||
cparams.type_k = type_k;
|
||||
cparams.type_v = type_v;
|
||||
cparams.offload_kqv = !no_kv_offload;
|
||||
cparams.flash_attn_type = flash_attn ? LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED;
|
||||
cparams.embeddings = embeddings;
|
||||
cparams.op_offload = !no_op_offload;
|
||||
cparams.swa_full = false;
|
||||
|
||||
return cparams;
|
||||
}
|
||||
|
||||
@@ -15,25 +15,26 @@ Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deseru
|
||||
def create_server():
|
||||
global server
|
||||
server = ServerPreset.tinyllama2()
|
||||
server.n_ctx = 256
|
||||
server.n_ctx = 512
|
||||
server.n_slots = 2
|
||||
server.n_predict = 128
|
||||
|
||||
|
||||
def test_ctx_shift_enabled():
|
||||
# the prompt is 301 tokens
|
||||
# the slot context is 256/2 = 128 tokens
|
||||
# the prompt is truncated to keep the last 109 tokens
|
||||
# 64 tokens are generated thanks to shifting the context when it gets full
|
||||
# the slot context is 512/2 = 256 tokens
|
||||
# the prompt is truncated to keep the last (301 - 256/2) = 173 tokens
|
||||
# 96 tokens are generated thanks to shifting the context when it gets full
|
||||
global server
|
||||
server.enable_ctx_shift = True
|
||||
server.start()
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"n_predict": 64,
|
||||
"n_predict": 96,
|
||||
"prompt": LONG_TEXT,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert res.body["timings"]["prompt_n"] == 109
|
||||
assert res.body["timings"]["predicted_n"] == 64
|
||||
assert res.body["timings"]["prompt_n"] == 173
|
||||
assert res.body["timings"]["predicted_n"] == 96
|
||||
assert res.body["truncated"] is True
|
||||
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@ def create_server():
|
||||
server.model_draft = download_file(MODEL_DRAFT_FILE_URL)
|
||||
server.draft_min = 4
|
||||
server.draft_max = 8
|
||||
server.fa = "off"
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
|
||||
@@ -66,7 +66,7 @@ class ServerProcess:
|
||||
n_slots: int | None = None
|
||||
ctk: str | None = None
|
||||
ctv: str | None = None
|
||||
fa: bool | None = None
|
||||
fa: str | None = None
|
||||
server_continuous_batching: bool | None = False
|
||||
server_embeddings: bool | None = False
|
||||
server_reranking: bool | None = False
|
||||
@@ -161,7 +161,7 @@ class ServerProcess:
|
||||
if self.ctv:
|
||||
server_args.extend(["-ctv", self.ctv])
|
||||
if self.fa is not None:
|
||||
server_args.append("-fa")
|
||||
server_args.extend(["-fa", self.fa])
|
||||
if self.n_predict:
|
||||
server_args.extend(["--n-predict", self.n_predict])
|
||||
if self.slot_save_path:
|
||||
@@ -427,7 +427,7 @@ class ServerPreset:
|
||||
server.n_batch = 300
|
||||
server.n_ubatch = 300
|
||||
server.n_slots = 2
|
||||
server.fa = True
|
||||
server.fa = "on"
|
||||
server.seed = 42
|
||||
server.server_embeddings = True
|
||||
return server
|
||||
|
||||
Reference in New Issue
Block a user