graph : avoid huge warm-up graphs for MoE models (#14753)

* graph : avoid huge warm-up graphs for MoE models

ggml-ci

* cont : bump max nodes to 8x model tensors
```diff
@@ -1312,7 +1312,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 //
 
 uint32_t llama_context::graph_max_nodes() const {
-    return std::max<uint32_t>(65536u, 5u*model.n_tensors());
+    return std::max<uint32_t>(1024u, 8u*model.n_tensors());
 }
 
 llm_graph_result * llama_context::get_gf_res_reserve() const {
```
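For scale, here is a minimal standalone sketch (not from the repository; the tensor counts are hypothetical) comparing the reserved node cap before and after this change. The old fixed floor of 65536 nodes dominated for small and mid-sized models, while the new formula lets the reservation track the model's actual tensor count:

```cpp
// Minimal standalone sketch (not from the repository): compares the reserved
// graph-node cap before and after this commit. The tensor counts below are
// hypothetical examples.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static uint32_t max_nodes_old(uint32_t n_tensors) {
    return std::max<uint32_t>(65536u, 5u*n_tensors); // old: large fixed floor
}

static uint32_t max_nodes_new(uint32_t n_tensors) {
    return std::max<uint32_t>(1024u, 8u*n_tensors);  // new: scales with the model
}

int main() {
    for (uint32_t n_tensors : {291u, 1131u, 13000u}) {
        std::printf("n_tensors=%5u  old cap=%6u  new cap=%6u\n",
                    n_tensors, max_nodes_old(n_tensors), max_nodes_new(n_tensors));
    }
    return 0;
}
```

For a small model (291 tensors) the reservation drops from 65536 to 2328 nodes, while for a very large one (13000 tensors) the new cap grows past the old floor, which matches the "bump max nodes to 8x model tensors" follow-up.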
```diff
@@ -906,8 +906,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     }
 
     // aggregate experts
+    // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
+    //       to avoid potentially a large number of add nodes during warmup
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/14753
     ggml_tensor * moe_out = nullptr;
-    for (int i = 0; i < n_expert_used; ++i) {
+    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
         ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens,
                 experts->nb[2], i*experts->nb[1]);
 
@@ -918,7 +921,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         }
     }
 
-    if (n_expert_used == 1) {
+    if (hparams.n_expert_used == 1) {
         // avoid returning a non-contiguous tensor
         moe_out = ggml_cont(ctx0, moe_out);
     }
```
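To see why this helps, consider the node counts involved. The sketch below is standalone and not from the repository; it assumes (inferred from the note in the diff, not stated there explicitly) that during warm-up the local n_expert_used may be raised as high as n_expert so every expert's weights get touched, while hparams.n_expert_used keeps the model's configured top-k. The aggregation loop emits roughly one add node per expert after the first, in every MoE layer:

```cpp
// Standalone sketch of the warm-up graph blow-up this commit avoids.
// All counts below are hypothetical examples.
#include <cstdint>
#include <cstdio>

// add nodes emitted by the aggregation loop for n_used experts:
// the first iteration assigns moe_out, the rest accumulate into it
static uint32_t aggregation_add_nodes(uint32_t n_used) {
    return n_used > 0 ? n_used - 1 : 0;
}

int main() {
    const uint32_t n_expert              = 128; // hypothetical expert count
    const uint32_t hparams_n_expert_used = 8;   // hypothetical configured top-k
    const uint32_t n_moe_layers          = 60;  // hypothetical MoE layer count

    // before: the loop ran over the (possibly raised) local n_expert_used
    const uint32_t before = n_moe_layers * aggregation_add_nodes(n_expert);
    // after: the loop is pinned to hparams.n_expert_used
    const uint32_t after  = n_moe_layers * aggregation_add_nodes(hparams_n_expert_used);

    std::printf("warm-up aggregation add nodes: before=%u  after=%u\n", before, after);
    return 0;
}
```

Under these assumptions the per-warm-up aggregation cost falls from 7620 add nodes to 420, so the warm-up graph keeps the same shape as a regular decode graph; the expert weights can still all be exercised earlier in build_moe_ffn, and only the redundant per-expert add chain is dropped.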