CUDA: General GEMV fusion (#16715)
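This change adds a backend test for the CUDA GEMV fusion: up/gate mat-vec products (ggml_mul_mat or ggml_mul_mat_id), optional bias adds, and a GLU activation, evaluated and checked as a whole graph. For orientation, here is a minimal standalone sketch of the unfused graph pattern the test builds (illustrative only, not code from the commit; the context size and shapes are arbitrary assumptions):

    #include "ggml.h"

    int main() {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16 * 1024 * 1024, // arbitrary scratch size for this sketch
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        const int64_t k = 256; // embedding width
        const int64_t n = 32;  // FFN width

        // a single row on the activation side makes both mat-muls GEMVs
        ggml_tensor * cur  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, k, 1);
        ggml_tensor * up   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, k, n);
        ggml_tensor * gate = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, k, n);

        ggml_tensor * ffn_up   = ggml_mul_mat(ctx, up,   cur); // GEMV
        ggml_tensor * ffn_gate = ggml_mul_mat(ctx, gate, cur); // GEMV
        // the GLU combining the two GEMV results completes the fusable pattern
        ggml_tensor * out = ggml_glu_split(ctx, ffn_gate, ffn_up, GGML_GLU_OP_SWIGLU);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, out);

        ggml_free(ctx);
        return 0;
    }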
@@ -4721,6 +4721,140 @@ struct test_topk_moe: public test_case {
    }
};

struct test_mul_mat_vec_fusion : public test_case {
    const ggml_type type;
    const ggml_glu_op glu_op;
    const int64_t m;
    const int64_t n;
    const int64_t k;
    const bool use_id;
    const int n_mats;
    const int n_used;
    const bool b; // broadcast b matrix (only for use_id)
    const bool with_bias;
    const bool with_gate;

    test_mul_mat_vec_fusion(ggml_type type, ggml_glu_op op, int64_t m, int64_t n, int64_t k,
            bool use_id = false, int n_mats = 1, int n_used = 1, bool b = false, bool with_bias = false, bool with_gate = true)
        : type(type), glu_op(op), m(m), n(n), k(k), use_id(use_id), n_mats(n_mats), n_used(n_used), b(b), with_bias(with_bias), with_gate(with_gate) {
        if (use_id) {
            GGML_ASSERT(n_used <= n_mats);
        }
    }

    std::string vars() override {
        return VARS_TO_STR11(type, glu_op, m, n, k, use_id, n_mats, n_used, b, with_bias, with_gate);
    }

    std::string op_desc(ggml_tensor * t) override {
        GGML_UNUSED(t);
        return "MUL_MAT_VEC_FUSION";
    }

    bool run_whole_graph() override { return true; }

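    // Exposition (added here, not part of the commit): run_whole_graph() makes
    // the harness execute and compare the complete graph against the reference
    // backend rather than checking a single op in isolation. This matters for
    // fusion tests: the backend only fuses when it sees the full pattern.
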
    ggml_tensor * build_gate(ggml_context * ctx, ggml_tensor * ffn_gate, ggml_tensor * ffn_up) {
        ggml_tensor * out = nullptr;
        if (with_gate) {
            if (glu_op == GGML_GLU_OP_SWIGLU_OAI) {
                constexpr float alpha = 1.702f;
                constexpr float limit = 7.0f;
                out = ggml_swiglu_oai(ctx, ffn_gate, ffn_up, alpha, limit);
            } else {
                out = ggml_glu_split(ctx, ffn_gate, ffn_up, glu_op);
            }
        }
        return out;
    }

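    // For reference (exposition only): ggml_glu_split applies a gated linear
    // unit to two separate inputs, e.g. SWIGLU: out = silu(ffn_gate) * ffn_up,
    // GEGLU: out = gelu(ffn_gate) * ffn_up. SWIGLU_OAI is the gpt-oss variant,
    // which clamps its inputs using `limit` and gates with sigmoid(alpha * x).
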
    ggml_tensor * build_graph(ggml_context * ctx) override {
        if (!use_id) {
            std::array<int64_t, 4> ne  = {k, m, 1, 1};
            std::array<int64_t, 4> ne0 = {k, n, 1, 1};

            ggml_tensor * cur  = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne.data());
            ggml_tensor * gate = with_gate ? ggml_new_tensor(ctx, type, 4, ne0.data()) : nullptr;
            ggml_tensor * up   = ggml_new_tensor(ctx, type, 4, ne0.data());

            ggml_tensor * ffn_up = ggml_mul_mat(ctx, up, cur);
            if (with_bias) {
                std::array<int64_t, 4> bias_ne = {ffn_up->ne[0], 1, 1, 1};
                ggml_tensor * up_bias = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, bias_ne.data());
                ffn_up = ggml_add(ctx, ffn_up, up_bias);
            }

            ggml_tensor * ffn_gate = with_gate ? ggml_mul_mat(ctx, gate, cur) : nullptr;
            if (with_bias && with_gate) {
                std::array<int64_t, 4> bias_ne = {ffn_gate->ne[0], 1, 1, 1};
                ggml_tensor * gate_bias = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, bias_ne.data());
                ffn_gate = ggml_add(ctx, ffn_gate, gate_bias);
            }

            ggml_tensor * out = with_gate ? build_gate(ctx, ffn_gate, ffn_up) : ffn_up;
            ggml_set_name(out, "out");
            return out;
        } else {
            ggml_tensor * gates = ggml_new_tensor_3d(ctx, type, k, n, n_mats);
            ggml_tensor * ups   = ggml_new_tensor_3d(ctx, type, k, n, n_mats);
            ggml_tensor * ids   = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, m);

            if (n_used != n_mats) {
                ids = ggml_view_2d(ctx, ids, n_used, m, ids->nb[1], 0);
            }

            ggml_tensor * cur = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, k, this->b ? 1 : n_used, m);
            ggml_set_name(cur, "cur");

            ggml_tensor * ffn_up = ggml_mul_mat_id(ctx, ups, cur, ids);
            if (with_bias) {
                ggml_tensor * up_bias_param = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ffn_up->ne[0], n_mats);
                ffn_up = ggml_add_id(ctx, ffn_up, up_bias_param, ids);
            }

            ggml_tensor * ffn_gate = with_gate ? ggml_mul_mat_id(ctx, gates, cur, ids) : nullptr;
            if (with_bias && with_gate) {
                ggml_tensor * gate_bias_param = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ffn_gate->ne[0], n_mats);
                ffn_gate = ggml_add_id(ctx, ffn_gate, gate_bias_param, ids);
            }

            ggml_tensor * out = with_gate ? build_gate(ctx, ffn_gate, ffn_up) : ffn_up;
            ggml_set_name(out, "out");
            return out;
        }
    }

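    // Exposition (not in the commit): ggml_mul_mat_id is the MoE routing
    // matmul -- for each token it multiplies only by the expert matrices
    // selected in `ids`; ggml_add_id likewise adds the matching per-expert
    // bias row. With b == true, a single activation row is broadcast across
    // all n_used selected experts.
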
    void initialize_tensors(ggml_context * ctx) override {
        if (!use_id) {
            for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
                init_tensor_uniform(t);
            }
        } else {
            std::random_device rd;
            std::default_random_engine rng(rd());
            for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
                if (t->type == GGML_TYPE_I32) {
                    if (ggml_is_view_op(t->op)) {
                        continue;
                    }
                    // ids: fill each row with a shuffled permutation of the expert indices
                    for (int64_t r = 0; r < ggml_nrows(t); r++) {
                        std::vector<int32_t> data(t->ne[0]);
                        for (int i = 0; i < t->ne[0]; i++) {
                            data[i] = i % n_mats;
                        }
                        std::shuffle(data.begin(), data.end(), rng);
                        ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
                    }
                } else {
                    init_tensor_uniform(t);
                }
            }
        }
    }

    double max_nmse_err() override {
        return 5e-3;
    }
};

// GGML_OP_SUM
struct test_sum : public test_case {
    const ggml_type type;

@@ -6983,6 +7117,33 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}));
    test_cases.emplace_back(new test_opt_step_sgd(GGML_TYPE_F32, {10, 5, 4, 3}));

    for (ggml_type type : base_types) {
        for (bool with_gate : {false, true}) {
            for (bool use_id : {false, true}) {
                for (bool b : {false, true}) {
                    if (!use_id && b) {
                        continue;
                    }
                    for (bool with_bias : {false, true}) {
                        if (!with_gate && !with_bias) {
                            continue;
                        }
                        // SWIGLU_OAI is included so the guard below is reachable
                        for (ggml_glu_op glu_op : {GGML_GLU_OP_SWIGLU, GGML_GLU_OP_GEGLU, GGML_GLU_OP_SWIGLU_OAI}) {
                            if (!with_bias && glu_op == GGML_GLU_OP_SWIGLU_OAI) {
                                continue;
                            }
                            if (!with_gate && glu_op != GGML_GLU_OP_SWIGLU) {
                                continue;
                            }
                            test_cases.emplace_back(new test_mul_mat_vec_fusion(type, glu_op, 1, 32, 256,
                                                                                use_id, 16, 8, b, with_bias, with_gate));
                        }
                    }
                }
            }
        }
    }

    for (bool with_norm : {false, true}) {
        test_cases.emplace_back(new test_topk_moe({8, 22, 1, 1}, 4, with_norm));
        test_cases.emplace_back(new test_topk_moe({32, 22, 1, 1}, 8, with_norm));
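To exercise just these cases, the usual test-backend-ops filters should work; the op name comes from op_desc() above (hypothetical invocation, paths may differ):

    ./build/bin/test-backend-ops test -b CUDA0 -o MUL_MAT_VEC_FUSION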