mirror of https://github.com/ggml-org/llama.cpp.git
llama : add --n-cpu-moe option (#15077)

* llama : add --n-cpu-moe option

Keeps the MoE weights of the first N layers in the CPU
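In practice this sits between --cpu-moe, which keeps all expert weights in the CPU, and full GPU offload: assuming a MoE model at model.gguf (an illustrative path), an invocation along the lines of llama-cli -m model.gguf -ngl 99 --n-cpu-moe 10 offloads the layers with -ngl while keeping only the expert tensors of the first 10 layers in system memory; the LLAMA_ARG_N_CPU_MOE environment variable added below has the same effect.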
@@ -24,6 +24,7 @@
 #include <cstdarg>
 #include <filesystem>
 #include <fstream>
+#include <list>
 #include <regex>
 #include <set>
 #include <string>
@@ -2375,20 +2376,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                     }
                     throw std::invalid_argument("unknown buffer type");
                 }
-                // FIXME: this leaks memory
-                params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
+                // keep strings alive and avoid leaking memory by storing them in a static vector
+                static std::list<std::string> buft_overrides;
+                buft_overrides.push_back(tensor_name);
+                params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
             }
         }
     ));
     add_opt(common_arg(
-        {"--cpu-moe"},
-        "use CPU for Mixture of Experts (MoE) weights",
+        {"--cpu-moe", "-cmoe"},
+        "keep all Mixture of Experts (MoE) weights in the CPU",
         [](common_params & params) {
-            params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
-            params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
-            params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
         }
     ).set_env("LLAMA_ARG_CPU_MOE"));
+    add_opt(common_arg(
+        {"--n-cpu-moe", "-ncmoe"}, "N",
+        "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
+        [](common_params & params, int value) {
+            if (value < 0) {
+                throw std::invalid_argument("invalid value");
+            }
+            for (int i = 0; i < value; ++i) {
+                // keep strings alive and avoid leaking memory by storing them in a static vector
+                static std::list<std::string> buft_overrides;
+                buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+                params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
+            }
+        }
+    ).set_env("LLAMA_ARG_N_CPU_MOE"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",