Mirror of https://github.com/ggml-org/llama.cpp.git, synced 2025-11-04 09:32:00 +00:00
	llama : disable pipeline parallelism if compute buffer allocation fails (#16748)
@@ -268,9 +268,7 @@ llama_context::llama_context(
        if (pipeline_parallel) {
            LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
        }
    }

    if (!hparams.vocab_only) {
        llama_memory_context_ptr mctx;
        if (memory) {
            LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
@@ -342,9 +340,16 @@ llama_context::llama_context(
        // reserve pp (prompt processing) graph first so that buffers are only allocated once
        {
            auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
            if (!gf) {
                if (pipeline_parallel) {
                    LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
                    sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
                    gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
                }
                if (!gf) {
                    throw std::runtime_error("failed to allocate compute pp buffers");
                }
            }

            n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
            n_nodes_pp  = ggml_graph_n_nodes(gf);
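The second hunk is an allocate-or-fall-back pattern: the constructor first tries to reserve the prompt-processing graph with the pipeline-parallel scheduler (which keeps extra copies of the compute buffers), and only if that allocation fails does it rebuild the scheduler with pipeline parallelism disabled and retry before throwing. The standalone C++ sketch below illustrates the same control flow using hypothetical stand-ins (sched_t, reserve_graph) rather than the actual ggml/llama.cpp types; it is an illustration of the pattern, not the real implementation.

// Minimal sketch (hypothetical, not llama.cpp code): try an expensive setup with
// pipeline parallelism enabled; if allocation fails, rebuild without it and retry
// once before giving up.
#include <cstdio>
#include <memory>
#include <stdexcept>

// Hypothetical stand-in for a backend scheduler; 'parallel' mirrors the boolean
// passed to the scheduler constructor in the real code.
struct sched_t {
    bool parallel;
};

// Hypothetical stand-in for graph_reserve(): returns false when compute buffers
// cannot be allocated (assumed more likely when extra copies are kept for
// pipeline parallelism).
static bool reserve_graph(const sched_t & sched) {
    return !sched.parallel; // pretend only the non-parallel configuration fits in memory
}

int main() {
    const bool pipeline_parallel = true;

    auto sched = std::make_unique<sched_t>(sched_t{pipeline_parallel});

    if (!reserve_graph(*sched)) {
        if (pipeline_parallel) {
            std::fprintf(stderr, "compute buffer allocation failed, retrying without pipeline parallelism\n");
            sched = std::make_unique<sched_t>(sched_t{false}); // rebuild without parallel copies
        }
        if (!reserve_graph(*sched)) {
            throw std::runtime_error("failed to allocate compute pp buffers");
        }
    }

    std::printf("graph reserved (pipeline parallelism: %s)\n", sched->parallel ? "on" : "off");
    return 0;
}

With the stubbed reserve_graph failing only in the parallel configuration, the program prints the warning and then reports that the graph was reserved with pipeline parallelism off, mirroring the fallback behaviour the commit adds.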