Mirror of https://github.com/ggml-org/llama.cpp.git, synced 2025-11-04 09:32:00 +00:00
	llama : disable pipeline parallelism if compute buffer allocation fails (#16748)
@@ -268,9 +268,7 @@ llama_context::llama_context(
        if (pipeline_parallel) {
            LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
        }
    }

    if (!hparams.vocab_only) {
        llama_memory_context_ptr mctx;
        if (memory) {
            LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
@@ -342,9 +340,16 @@ llama_context::llama_context(
        // reserve pp (prompt processing) graph first so that buffers are only allocated once
        {
            auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
            if (!gf) {
                if (pipeline_parallel) {
                    LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
                    sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
                    gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
                }
                if (!gf) {
                    throw std::runtime_error("failed to allocate compute pp buffers");
                }
            }

            n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
            n_nodes_pp  = ggml_graph_n_nodes(gf);
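The second hunk is an allocate-or-fall-back pattern: the constructor first tries to reserve the prompt-processing graph with the pipeline-parallel scheduler (which keeps extra copies of the compute buffers), and only if that allocation fails does it rebuild the scheduler with pipeline parallelism disabled and retry before throwing. The standalone C++ sketch below illustrates the same control flow using hypothetical stand-ins (sched_t, reserve_graph) rather than the actual ggml/llama.cpp types; it is an illustration of the pattern, not the real implementation.

// Minimal sketch (hypothetical, not llama.cpp code): try an expensive setup with
// pipeline parallelism enabled; if allocation fails, rebuild without it and retry
// once before giving up.
#include <cstdio>
#include <memory>
#include <stdexcept>

// Hypothetical stand-in for a backend scheduler; 'parallel' mirrors the boolean
// passed to the scheduler constructor in the real code.
struct sched_t {
    bool parallel;
};

// Hypothetical stand-in for graph_reserve(): returns false when compute buffers
// cannot be allocated (assumed more likely when extra copies are kept for
// pipeline parallelism).
static bool reserve_graph(const sched_t & sched) {
    return !sched.parallel; // pretend only the non-parallel configuration fits in memory
}

int main() {
    const bool pipeline_parallel = true;

    auto sched = std::make_unique<sched_t>(sched_t{pipeline_parallel});

    if (!reserve_graph(*sched)) {
        if (pipeline_parallel) {
            std::fprintf(stderr, "compute buffer allocation failed, retrying without pipeline parallelism\n");
            sched = std::make_unique<sched_t>(sched_t{false}); // rebuild without parallel copies
        }
        if (!reserve_graph(*sched)) {
            throw std::runtime_error("failed to allocate compute pp buffers");
        }
    }

    std::printf("graph reserved (pipeline parallelism: %s)\n", sched->parallel ? "on" : "off");
    return 0;
}

With the stubbed reserve_graph failing only in the parallel configuration, the program prints the warning and then reports that the graph was reserved with pipeline parallelism off, mirroring the fallback behaviour the commit adds.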