Fix overly relaxed check on CUDA "fast copy" (can_be_transposed) condition (#17332)

* Fix overly relaxed check on CUDA "fast copy" (can_be_transposed) condition

* Argh.

* Making CISC happy ;)

* Integrate CONT tests

* Use loopy loop

* Skip new tests for (B)F16 for now.
This commit is contained in:
Piotr Wilkin (ilintar)
2025-11-19 10:36:33 +01:00
committed by GitHub
parent 980b7cd17e
commit 6fd4f95367
2 changed files with 29 additions and 17 deletions

View File

@@ -384,7 +384,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
char * src1_ddc = (char *) src1->data; char * src1_ddc = (char *) src1->data;
const bool contiguous_srcs = ggml_is_contiguous(src0) && ggml_is_contiguous(src1); const bool contiguous_srcs = ggml_is_contiguous(src0) && ggml_is_contiguous(src1);
const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) && src0->ne[3] == 1; const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) &&
src0->ne[3] == 1 && nb02 == ne00 * ne01 * (int64_t)ggml_element_size(src0);
if (src0->type == src1->type && contiguous_srcs) { if (src0->type == src1->type && contiguous_srcs) {
GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1)); GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));

View File

@@ -2776,24 +2776,34 @@ struct test_cpy : public test_case {
struct test_cont : public test_case { struct test_cont : public test_case {
const ggml_type type; const ggml_type type;
const std::array<int64_t, 4> ne; const std::array<int64_t, 4> ne;
bool use_view_slice;
std::string vars() override { std::string vars() override {
return VARS_TO_STR2(type, ne); return VARS_TO_STR3(type, ne, use_view_slice);
} }
test_cont(ggml_type type = GGML_TYPE_F32, test_cont(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {10, 10, 10, 1}) std::array<int64_t, 4> ne = {10, 10, 10, 1},
: type(type), ne(ne) {} bool use_view_slice = false)
: type(type), ne(ne), use_view_slice(use_view_slice) {}
ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data()); ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_set_param(src); ggml_set_param(src);
ggml_set_name(src, "src"); ggml_set_name(src, "src");
src = ggml_transpose(ctx, src);
ggml_set_name(src, "src_transposed");
ggml_tensor * out = ggml_cont(ctx, src); ggml_tensor * dst;
if (use_view_slice) {
dst = ggml_view_4d(ctx, src, src->ne[0], 1, src->ne[2], src->ne[3],
src->nb[1], src->nb[2], src->nb[3], src->nb[0] * (src->ne[1] - 1));
ggml_set_name(dst, "src_view_slice");
} else {
dst = ggml_transpose(ctx, src);
ggml_set_name(dst, "src_transposed");
}
ggml_tensor * out = ggml_cont(ctx, dst);
ggml_set_name(out, "out"); ggml_set_name(out, "out");
return out; return out;
@@ -6945,16 +6955,17 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0})); test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
test_cases.emplace_back(new test_cont()); for (ggml_type type_dst : { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16 }) {
test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1 ,1})); for (bool use_view_slice : { true, false }) {
test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 3 ,5})); for (std::array<int64_t, 4> ne : std::initializer_list<std::array<int64_t, 4>>{ {2, 1, 1, 1}, {2, 1, 3, 5},
test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 3, 5 ,7})); {2, 3, 5, 7}, {1, 4, 4, 1}, {1, 8, 17, 1}, {10, 10, 10, 1} }) {
test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 1 ,1})); if (use_view_slice && (type_dst == GGML_TYPE_F16 || type_dst == GGML_TYPE_BF16)) {
test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 3 ,5})); continue; // TODO: add after WebGPU is fixed
test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 3, 5 ,7})); }
test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 1 ,1})); test_cases.emplace_back(new test_cont(type_dst, ne, use_view_slice));
test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 3 ,5})); }
test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 3, 5 ,7})); }
}
auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) { auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) {
for (auto op : {ggml_add, ggml_sub, ggml_mul, ggml_div}) { for (auto op : {ggml_add, ggml_sub, ggml_mul, ggml_div}) {