mirror of https://github.com/ggml-org/llama.cpp.git
llama : add --n-cpu-moe option (#15077)

* llama : add --n-cpu-moe option

Keeps the MoE weights of the first N layers in the CPU
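In practice this sits between --cpu-moe, which keeps all expert weights in the CPU, and full GPU offload: assuming a MoE model at model.gguf (an illustrative path), an invocation along the lines of llama-cli -m model.gguf -ngl 99 --n-cpu-moe 10 offloads the layers with -ngl while keeping only the expert tensors of the first 10 layers in system memory; the LLAMA_ARG_N_CPU_MOE environment variable added below has the same effect.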
@@ -24,6 +24,7 @@
 #include <cstdarg>
 #include <filesystem>
 #include <fstream>
+#include <list>
 #include <regex>
 #include <set>
 #include <string>
@@ -2375,20 +2376,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                     }
                     throw std::invalid_argument("unknown buffer type");
                 }
-                // FIXME: this leaks memory
-                params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
+                // keep strings alive and avoid leaking memory by storing them in a static vector
+                static std::list<std::string> buft_overrides;
+                buft_overrides.push_back(tensor_name);
+                params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
             }
         }
     ));
     add_opt(common_arg(
-        {"--cpu-moe"},
-        "use CPU for Mixture of Experts (MoE) weights",
+        {"--cpu-moe", "-cmoe"},
+        "keep all Mixture of Experts (MoE) weights in the CPU",
         [](common_params & params) {
-            params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
-            params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
-            params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
         }
     ).set_env("LLAMA_ARG_CPU_MOE"));
+    add_opt(common_arg(
+        {"--n-cpu-moe", "-ncmoe"}, "N",
+        "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
+        [](common_params & params, int value) {
+            if (value < 0) {
+                throw std::invalid_argument("invalid value");
+            }
+            for (int i = 0; i < value; ++i) {
+                // keep strings alive and avoid leaking memory by storing them in a static vector
+                static std::list<std::string> buft_overrides;
+                buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+                params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
+            }
+        }
+    ).set_env("LLAMA_ARG_N_CPU_MOE"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",