	llama : add simple option to enable CPU for MoE weights (--cpu-moe) (#14992)
@@ -2380,6 +2380,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ));
+    add_opt(common_arg(
+        {"--cpu-moe"},
+        "use CPU for Mixture of Experts (MoE) weights",
+        [](common_params & params) {
+            params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$",   ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+        }
+    ).set_env("LLAMA_ARG_CPU_MOE"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
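
The three strings pushed into params.tensor_buft_overrides are regular expressions that are matched against tensor names, so the MoE expert weight tensors (ffn_up_exps, ffn_down_exps, ffn_gate_exps) get pinned to the CPU buffer type while every other tensor keeps its default placement. Below is a minimal standalone sketch of that matching behavior, not llama.cpp code; the tensor names are illustrative examples of the usual naming scheme:

    // Sketch: match MoE expert weight names against the same regex patterns
    // used by --cpu-moe, and report which tensors would be kept on the CPU.
    #include <cstdio>
    #include <regex>
    #include <string>
    #include <vector>

    int main() {
        const std::vector<std::string> patterns = {
            R"(\.ffn_up_exps\.weight$)",
            R"(\.ffn_down_exps\.weight$)",
            R"(\.ffn_gate_exps\.weight$)",
        };
        const std::vector<std::string> tensor_names = {
            "blk.0.ffn_up_exps.weight",    // expert weights -> matched
            "blk.0.ffn_down_exps.weight",  // expert weights -> matched
            "blk.0.attn_q.weight",         // dense weights  -> not matched
        };
        for (const auto & name : tensor_names) {
            bool matched = false;
            for (const auto & pat : patterns) {
                if (std::regex_search(name, std::regex(pat))) {
                    matched = true;
                    break;
                }
            }
            printf("%-28s -> %s\n", name.c_str(), matched ? "CPU buffer" : "default buffer");
        }
        return 0;
    }

In practice the flag would typically be combined with -ngl, so the dense layers are offloaded to the GPU while the large expert weights stay in system memory; as the set_env call above shows, the same behavior can be enabled through the LLAMA_ARG_CPU_MOE environment variable.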
Diego Devesa