OpenCL: add fused group_norm/norm, mul, add (#15314)

* add fused group_norm/norm, mul, add

* fix spacing

* revert rms_norm logic

* fix trailing whitespace
Authored by rmatif on 2025-08-27 08:36:05 +02:00; committed by GitHub.
parent bcbddcd54f
commit 86076f92de
4 changed files with 399 additions and 4 deletions


@@ -2789,6 +2789,49 @@ struct test_norm : public test_case {
    }
};

// GGML_OP_NORM + GGML_OP_MUL + GGML_OP_ADD
struct test_norm_mul_add : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    float eps;
    const bool broadcast;

    std::string op_desc(ggml_tensor * t) override {
        GGML_UNUSED(t);
        return "NORM_MUL_ADD";
    }

    bool run_whole_graph() override { return true; }

    std::string vars() override {
        return VARS_TO_STR4(type, ne, eps, broadcast);
    }

    test_norm_mul_add(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {128, 2, 1, 1},
            float eps = 1e-5f,
            bool broadcast = false)
        : type(type), ne(ne), eps(eps), broadcast(broadcast) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        std::array<int64_t, 4> broadcast_dims = {ne[0], ne[1] * 2, ne[2] * 2, ne[3] * 2};
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, broadcast ? broadcast_dims.data() : ne.data());
        ggml_tensor * w = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_set_param(a); ggml_set_param(w); ggml_set_param(b);
        ggml_set_name(a, "a"); ggml_set_name(w, "w"); ggml_set_name(b, "b");
        // Use a, w and b early to avoid OP_NONE in graph
        a = ggml_add(ctx, ggml_add(ctx, a, w), b);
        ggml_tensor * n = ggml_norm(ctx, a, eps);
        ggml_tensor * m = ggml_mul(ctx, n, w);
        ggml_tensor * out = ggml_add(ctx, m, b);
        ggml_set_name(out, "out");
        return out;
    }
};

// GGML_OP_RMS_NORM
struct test_rms_norm : public test_case {
    const ggml_type type;
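For reference, the NORM_MUL_ADD cases above should reproduce plain layer-norm algebra: normalize each row of length ne[0], then scale by w and shift by b elementwise. A minimal scalar sketch of that expectation, assuming contiguous F32 data (the helper below is illustrative only and not part of this commit):

    #include <cmath>
    #include <cstddef>

    // Reference for one row: layer-normalize x, then y = norm(x) * w + b.
    static void norm_mul_add_row(const float * x, const float * w, const float * b,
                                 float * y, size_t n, float eps) {
        float mean = 0.0f;
        for (size_t i = 0; i < n; ++i) {
            mean += x[i];
        }
        mean /= n;

        float var = 0.0f;
        for (size_t i = 0; i < n; ++i) {
            const float d = x[i] - mean;
            var += d * d;
        }
        var /= n;

        const float inv = 1.0f / std::sqrt(var + eps);
        for (size_t i = 0; i < n; ++i) {
            y[i] = (x[i] - mean) * inv * w[i] + b[i]; // NORM -> MUL -> ADD
        }
    }

Whether a backend runs this as three kernels or one fused kernel must not change the result; run_whole_graph() returning true hands the entire graph to the backend during the comparison, which is what lets a fused path actually trigger.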
@@ -4475,6 +4518,44 @@ struct test_group_norm : public test_case {
    }
};

// GGML_OP_GROUP_NORM + GGML_OP_MUL + GGML_OP_ADD
struct test_group_norm_mul_add : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    int num_groups;
    float eps;

    std::string op_desc(ggml_tensor * t) override {
        GGML_UNUSED(t);
        return "GROUP_NORM_MUL_ADD";
    }

    bool run_whole_graph() override { return true; }

    std::string vars() override {
        return VARS_TO_STR4(type, ne, num_groups, eps);
    }

    test_group_norm_mul_add(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {128, 1, 1, 1},
            int num_groups = 4,
            float eps = 1e-5f)
        : type(type), ne(ne), num_groups(num_groups), eps(eps) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * w = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_set_param(a); ggml_set_param(w); ggml_set_param(b);
        ggml_set_name(a, "a"); ggml_set_name(w, "w"); ggml_set_name(b, "b");
        ggml_tensor * n = ggml_group_norm(ctx, a, num_groups, eps);
        ggml_tensor * m = ggml_mul(ctx, n, w);
        ggml_tensor * out = ggml_add(ctx, m, b);
        ggml_set_name(out, "out");
        return out;
    }
};

// GGML_OP_L2_NORM
struct test_l2_norm : public test_case {
    const ggml_type type;
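The group-norm variant changes only the normalization domain: statistics are computed per group of ne[2] / num_groups channels, across all ne[0] * ne[1] positions within the group, rather than per row. A hedged scalar sketch for a contiguous [ne0, ne1, C, 1] F32 tensor (again a hypothetical helper, not code from this commit):

    #include <cmath>
    #include <cstdint>

    // Group-normalize x, then apply elementwise scale w and shift b.
    static void group_norm_mul_add(const float * x, const float * w, const float * b,
                                   float * y, int64_t ne0, int64_t ne1, int64_t C,
                                   int num_groups, float eps) {
        const int64_t per_group = C / num_groups;        // channels per group
        const int64_t group_sz  = ne0 * ne1 * per_group; // elements per group
        for (int g = 0; g < num_groups; ++g) {
            const int64_t base = g * group_sz;

            float mean = 0.0f;
            for (int64_t i = 0; i < group_sz; ++i) {
                mean += x[base + i];
            }
            mean /= group_sz;

            float var = 0.0f;
            for (int64_t i = 0; i < group_sz; ++i) {
                const float d = x[base + i] - mean;
                var += d * d;
            }
            var /= group_sz;

            const float inv = 1.0f / std::sqrt(var + eps);
            for (int64_t i = 0; i < group_sz; ++i) {
                y[base + i] = (x[base + i] - mean) * inv * w[base + i] + b[base + i];
            }
        }
    }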
@@ -5865,6 +5946,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f, 1.0f}) {
        test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
        test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true));
        test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, false));
        test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true));
    }

    for (uint32_t n : {1, 511, 1025, 8192, 33*512}) {
        for (bool multi_add : {false, true}) {
@@ -6253,6 +6336,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 32769, 1, 1, 1 }));
    test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {64, 64, 320, 1}));
    test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {9, 9, 1280, 1}));
    test_cases.emplace_back(new test_group_norm_mul_add(GGML_TYPE_F32, {64, 64, 320, 1}));
    test_cases.emplace_back(new test_group_norm_mul_add(GGML_TYPE_F32, {9, 9, 1280, 1}));
    test_cases.emplace_back(new test_acc());
    test_cases.emplace_back(new test_pad());
    test_cases.emplace_back(new test_pad_reflect_1d());
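Once registered, the new cases can be run in isolation against the CPU reference. Assuming the usual test-backend-ops flags, where -o filters by the op_desc() string and -b selects the backend (both are assumptions about the current CLI; check --help for your build):

    ./bin/test-backend-ops test -b OpenCL -o NORM_MUL_ADD
    ./bin/test-backend-ops test -b OpenCL -o GROUP_NORM_MUL_ADD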