Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00).

Commit: Merge branch 'master' into compilade/mamba2
This commit is contained in:
		| @@ -18,20 +18,22 @@ | ||||
| #include <ggml.h> | ||||
| #include <ggml-alloc.h> | ||||
| #include <ggml-backend.h> | ||||
| #include <ggml-cpp.h> | ||||
|  | ||||
| #include <algorithm> | ||||
| #include <array> | ||||
| #include <cfloat> | ||||
| #include <cstdint> | ||||
| #include <cstring> | ||||
| #include <cinttypes> | ||||
| #include <cstdint> | ||||
| #include <cstdio> | ||||
| #include <cstdlib> | ||||
| #include <cstring> | ||||
| #include <future> | ||||
| #include <memory> | ||||
| #include <random> | ||||
| #include <stdio.h> | ||||
| #include <stdlib.h> | ||||
| #include <regex> | ||||
| #include <string> | ||||
| #include <thread> | ||||
| #include <future> | ||||
| #include <vector> | ||||
|  | ||||
| static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) { | ||||
| @@ -257,6 +259,10 @@ static std::string var_to_str(ggml_type type) { | ||||
|     return ggml_type_name(type); | ||||
| } | ||||
|  | ||||
| static std::string var_to_str(ggml_prec prec) { | ||||
|     return prec == GGML_PREC_F32 ? "f32" : "def"; | ||||
| } | ||||
|  | ||||
| static std::string var_to_str(ggml_op_pool pool) { | ||||
|     switch (pool) { | ||||
|         case GGML_OP_POOL_AVG:  return "avg"; | ||||
| @@ -265,6 +271,14 @@ static std::string var_to_str(ggml_op_pool pool) { | ||||
|     } | ||||
| } | ||||
|  | ||||
| static std::string var_to_str(ggml_scale_mode mode) { | ||||
|     switch (mode) { | ||||
|         case GGML_SCALE_MODE_NEAREST:  return "nearest"; | ||||
|         case GGML_SCALE_MODE_BILINEAR: return "bilinear"; | ||||
|         default:                      return std::to_string(mode); | ||||
|     } | ||||
| } | ||||
|  | ||||
| #define VAR_TO_STR(x) (#x "=" + var_to_str(x)) | ||||
|  | ||||
| #define VARS_TO_STR1(a) VAR_TO_STR(a) | ||||
| @@ -467,6 +481,7 @@ struct test_case { | ||||
|  | ||||
|         // allocate | ||||
|         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1); | ||||
|  | ||||
|         if (buf == NULL) { | ||||
|             printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1)); | ||||
|             ggml_free(ctx); | ||||
| @@ -588,14 +603,13 @@ struct test_case { | ||||
|             /* .mem_base = */ NULL, | ||||
|             /* .no_alloc = */ true, | ||||
|         }; | ||||
|         ggml_context * ctx = ggml_init(params); | ||||
|         ggml_context_ptr ctx(ggml_init(params)); // smart ptr | ||||
|         GGML_ASSERT(ctx); | ||||
|  | ||||
|         ggml_tensor * out = build_graph(ctx); | ||||
|         ggml_tensor * out = build_graph(ctx.get()); | ||||
|  | ||||
|         if (op_name != nullptr && op_desc(out) != op_name) { | ||||
|             //printf("  %s: skipping\n", op_desc(out).c_str()); | ||||
|             ggml_free(ctx); | ||||
|             return true; | ||||
|         } | ||||
|  | ||||
| @@ -605,7 +619,6 @@ struct test_case { | ||||
|         // check if backends support op | ||||
|         if (!ggml_backend_supports_op(backend, out)) { | ||||
|             printf("not supported\n"); | ||||
|             ggml_free(ctx); | ||||
|             return true; | ||||
|         } | ||||
|  | ||||
| @@ -618,22 +631,26 @@ struct test_case { | ||||
|         printf("%*s", last - len, ""); | ||||
|  | ||||
|         // allocate | ||||
|         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend); | ||||
|         ggml_backend_buffer_ptr buf(ggml_backend_alloc_ctx_tensors(ctx.get(), backend)); // smart ptr | ||||
|  | ||||
|         if (buf == NULL) { | ||||
|             printf("failed to allocate tensors\n"); | ||||
|             ggml_free(ctx); | ||||
|             return false; | ||||
|         } | ||||
|  | ||||
|         // randomize tensors | ||||
|         initialize_tensors(ctx); | ||||
|         initialize_tensors(ctx.get()); | ||||
|  | ||||
|         // build graph | ||||
|         ggml_cgraph * gf = ggml_new_graph_custom(ctx, graph_nodes, false); | ||||
|         ggml_cgraph * gf = ggml_new_graph_custom(ctx.get(), graph_nodes, false); | ||||
|         ggml_build_forward_expand(gf, out); | ||||
|  | ||||
|         // warmup run | ||||
|         ggml_backend_graph_compute(backend, gf); | ||||
|         ggml_status status = ggml_backend_graph_compute(backend, gf); | ||||
|         if (status != GGML_STATUS_SUCCESS) { | ||||
|             fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ggml_status_to_string(status)); | ||||
|             return false; | ||||
|         } | ||||
|  | ||||
|         // determine number of runs | ||||
|         int n_runs; | ||||
| @@ -684,7 +701,11 @@ struct test_case { | ||||
|         int total_runs = 0; | ||||
|         do { | ||||
|             int64_t start_time = ggml_time_us(); | ||||
|             ggml_backend_graph_compute(backend, gf); | ||||
|             ggml_status status = ggml_backend_graph_compute(backend, gf); | ||||
|             if (status != GGML_STATUS_SUCCESS) { | ||||
|                 fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ggml_status_to_string(status)); | ||||
|                 return false; | ||||
|             } | ||||
|             int64_t end_time = ggml_time_us(); | ||||
|  | ||||
|             total_time_us += end_time - start_time; | ||||
| @@ -722,10 +743,6 @@ struct test_case { | ||||
|         } | ||||
|         printf("\n"); | ||||
|  | ||||
|         ggml_backend_buffer_free(buf); | ||||
|  | ||||
|         ggml_free(ctx); | ||||
|  | ||||
|         return true; | ||||
|     } | ||||
|  | ||||
| @@ -738,17 +755,16 @@ struct test_case { | ||||
|             /* .mem_base = */ NULL, | ||||
|             /* .no_alloc = */ true, | ||||
|         }; | ||||
|         ggml_context * ctx = ggml_init(params); | ||||
|         ggml_context_ptr ctx(ggml_init(params)); // smart ptr | ||||
|         GGML_ASSERT(ctx); | ||||
|  | ||||
|         gf = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true); | ||||
|         gb = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true); | ||||
|         gf = ggml_new_graph_custom(ctx.get(), GGML_DEFAULT_GRAPH_SIZE, true); | ||||
|         gb = ggml_new_graph_custom(ctx.get(), GGML_DEFAULT_GRAPH_SIZE, true); | ||||
|  | ||||
|         ggml_tensor * out = build_graph(ctx); | ||||
|         ggml_tensor * out = build_graph(ctx.get()); | ||||
|  | ||||
|         if ((op_name != nullptr && op_desc(out) != op_name) || out->op == GGML_OP_OPT_STEP_ADAMW) { | ||||
|             //printf("  %s: skipping\n", op_desc(out).c_str()); | ||||
|             ggml_free(ctx); | ||||
|             return true; | ||||
|         } | ||||
|  | ||||
| @@ -756,7 +772,6 @@ struct test_case { | ||||
|         fflush(stdout); | ||||
|  | ||||
|         if (out->type != GGML_TYPE_F32) { | ||||
|             ggml_free(ctx); | ||||
|             printf("not supported [%s->type != FP32]\n", out->name); | ||||
|             return true; | ||||
|         } | ||||
| @@ -764,7 +779,7 @@ struct test_case { | ||||
|         // check if the backend supports the ops | ||||
|         bool supported = true; | ||||
|         bool any_params = false; | ||||
|         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { | ||||
|         for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != NULL; t = ggml_get_next_tensor(ctx.get(), t)) { | ||||
|             if (!ggml_backend_supports_op(backend, t)) { | ||||
|                 printf("not supported [%s] ", ggml_backend_name(backend)); | ||||
|                 supported = false; | ||||
| @@ -785,40 +800,38 @@ struct test_case { | ||||
|         } | ||||
|         if (!supported) { | ||||
|             printf("\n"); | ||||
|             ggml_free(ctx); | ||||
|             return true; | ||||
|         } | ||||
|  | ||||
|         int64_t ngrads = 0; | ||||
|         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { | ||||
|         for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != NULL; t = ggml_get_next_tensor(ctx.get(), t)) { | ||||
|             if (t->flags & GGML_TENSOR_FLAG_PARAM) { | ||||
|                 ngrads += ggml_nelements(t); | ||||
|             } | ||||
|         } | ||||
|         if (ngrads > grad_nmax()) { | ||||
|             printf("skipping large tensors for speed \n"); | ||||
|             ggml_free(ctx); | ||||
|             return true; | ||||
|         } | ||||
|  | ||||
|  | ||||
|         if (!ggml_is_scalar(out)) { | ||||
|             out = ggml_sum(ctx, out); | ||||
|             out = ggml_sum(ctx.get(), out); | ||||
|             ggml_set_name(out, "sum_of_out"); | ||||
|         } | ||||
|         ggml_set_loss(out); | ||||
|  | ||||
|         ggml_build_forward_expand(gf, out); | ||||
|         ggml_graph_cpy(gf, gb); | ||||
|         ggml_build_backward_expand(ctx, ctx, gb, false); | ||||
|         ggml_build_backward_expand(ctx.get(), ctx.get(), gb, false); | ||||
|         if (expect.size() != 1 || expect[0] != 0.0f) { | ||||
|             GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf)); | ||||
|             for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { | ||||
|             for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != NULL; t = ggml_get_next_tensor(ctx.get(), t)) { | ||||
|                 GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || ggml_graph_get_grad(gb, t)->op != GGML_OP_NONE); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { | ||||
|         for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != NULL; t = ggml_get_next_tensor(ctx.get(), t)) { | ||||
|             if (!ggml_backend_supports_op(backend, t)) { | ||||
|                 printf("not supported [%s] ", ggml_backend_name(backend)); | ||||
|                 supported = false; | ||||
| @@ -832,27 +845,32 @@ struct test_case { | ||||
|         } | ||||
|         if (!supported) { | ||||
|             printf("\n"); | ||||
|             ggml_free(ctx); | ||||
|             return true; | ||||
|         } | ||||
|  | ||||
|         // allocate | ||||
|         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend); | ||||
|         ggml_backend_buffer_ptr buf(ggml_backend_alloc_ctx_tensors(ctx.get(), backend)); // smart ptr | ||||
|         if (buf == NULL) { | ||||
|             printf("failed to allocate tensors [%s] ", ggml_backend_name(backend)); | ||||
|             ggml_free(ctx); | ||||
|             return false; | ||||
|         } | ||||
|  | ||||
|  | ||||
|         initialize_tensors(ctx); // Randomizes all tensors (including gradients). | ||||
|         initialize_tensors(ctx.get()); // Randomizes all tensors (including gradients). | ||||
|         ggml_graph_reset(gb);    // Sets gradients to 1 if loss, 0 otherwise. | ||||
|  | ||||
|         ggml_backend_graph_compute(backend, gf); | ||||
|         ggml_backend_graph_compute(backend, gb); | ||||
|         ggml_status status = ggml_backend_graph_compute(backend, gf); | ||||
|         if (status != GGML_STATUS_SUCCESS) { | ||||
|             fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ggml_status_to_string(status)); | ||||
|             return false; | ||||
|         } | ||||
|         status = ggml_backend_graph_compute(backend, gb); | ||||
|         if (status != GGML_STATUS_SUCCESS) { | ||||
|             fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ggml_status_to_string(status)); | ||||
|             return false; | ||||
|         } | ||||
|  | ||||
|         bool ok = true; | ||||
|         for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { | ||||
|         for (struct ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) { | ||||
|             if (!(t->flags & GGML_TENSOR_FLAG_PARAM)) { | ||||
|                 continue; | ||||
|             } | ||||
| @@ -897,20 +915,36 @@ struct test_case { | ||||
|                 float fu, fuh, fdh, fd; // output values for xiu, xiuh, xid, xidh | ||||
|  | ||||
|                 ggml_backend_tensor_set(t, &xiu, i*sizeof(float), sizeof(float)); | ||||
|                 ggml_backend_graph_compute(backend, gf); | ||||
|                 status = ggml_backend_graph_compute(backend, gf); | ||||
|                 if (status != GGML_STATUS_SUCCESS) { | ||||
|                     fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ggml_status_to_string(status)); | ||||
|                     return false; | ||||
|                 } | ||||
|                 ggml_backend_tensor_get(out, &fu, 0, ggml_nbytes(out)); | ||||
|  | ||||
|                 ggml_backend_tensor_set(t, &xid, i*sizeof(float), sizeof(float)); | ||||
|                 ggml_backend_graph_compute(backend, gf); | ||||
|                 status = ggml_backend_graph_compute(backend, gf); | ||||
|                 if (status != GGML_STATUS_SUCCESS) { | ||||
|                     fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ggml_status_to_string(status)); | ||||
|                     return false; | ||||
|                 } | ||||
|                 ggml_backend_tensor_get(out, &fd, 0, ggml_nbytes(out)); | ||||
|  | ||||
|                 if (grad_precise()) { | ||||
|                     ggml_backend_tensor_set(t, &xiuh, i*sizeof(float), sizeof(float)); | ||||
|                     ggml_backend_graph_compute(backend, gf); | ||||
|                     status = ggml_backend_graph_compute(backend, gf); | ||||
|                     if (status != GGML_STATUS_SUCCESS) { | ||||
|                         fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ggml_status_to_string(status)); | ||||
|                         return false; | ||||
|                     } | ||||
|                     ggml_backend_tensor_get(out, &fuh, 0, ggml_nbytes(out)); | ||||
|  | ||||
|                     ggml_backend_tensor_set(t, &xidh, i*sizeof(float), sizeof(float)); | ||||
|                     ggml_backend_graph_compute(backend, gf); | ||||
|                     status = ggml_backend_graph_compute(backend, gf); | ||||
|                     if (status != GGML_STATUS_SUCCESS) { | ||||
|                         fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ggml_status_to_string(status)); | ||||
|                         return false; | ||||
|                     } | ||||
|                     ggml_backend_tensor_get(out, &fdh, 0, ggml_nbytes(out)); | ||||
|  | ||||
|                     gn[i] = (8.0*(double)fuh + (double)fd - (8.0*(double)fdh + (double)fu)) / (6.0*(double)eps); | ||||
| @@ -936,10 +970,6 @@ struct test_case { | ||||
|             printf("compare failed "); | ||||
|         } | ||||
|  | ||||
|         ggml_backend_buffer_free(buf); | ||||
|  | ||||
|         ggml_free(ctx); | ||||
|  | ||||
|         if (ok) { | ||||
|             printf("\033[1;32mOK\033[0m\n"); | ||||
|             return true; | ||||
| @@ -1441,11 +1471,13 @@ struct test_cpy : public test_case { | ||||
|     const ggml_type type_src; | ||||
|     const ggml_type type_dst; | ||||
|     const std::array<int64_t, 4> ne; | ||||
|     const std::array<int64_t, 4> permute; | ||||
|     const std::array<int64_t, 4> permute_src; | ||||
|     const std::array<int64_t, 4> permute_dst; | ||||
|     bool _src_use_permute; | ||||
|     bool _dst_use_permute; | ||||
|  | ||||
|     std::string vars() override { | ||||
|         return VARS_TO_STR4(type_src, type_dst, ne, permute); | ||||
|         return VARS_TO_STR5(type_src, type_dst, ne, permute_src, permute_dst); | ||||
|     } | ||||
|  | ||||
|     double max_nmse_err() override { | ||||
| @@ -1458,9 +1490,11 @@ struct test_cpy : public test_case { | ||||
|  | ||||
|     test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32, | ||||
|             std::array<int64_t, 4> ne = {10, 10, 10, 1}, | ||||
|             std::array<int64_t, 4> permute = {0, 0, 0, 0}) | ||||
|         : type_src(type_src), type_dst(type_dst), ne(ne), permute(permute), | ||||
|           _src_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {} | ||||
|             std::array<int64_t, 4> permute_src = {0, 0, 0, 0}, | ||||
|             std::array<int64_t, 4> permute_dst = {0, 0, 0, 0}) | ||||
|         : type_src(type_src), type_dst(type_dst), ne(ne), permute_src(permute_src), permute_dst(permute_dst), | ||||
|           _src_use_permute(permute_src[0] + permute_src[1] + permute_src[2] + permute_src[3] > 0), | ||||
|           _dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0) {} | ||||
|  | ||||
|     ggml_tensor * build_graph(ggml_context * ctx) override { | ||||
|         ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data()); | ||||
| @@ -1468,13 +1502,18 @@ struct test_cpy : public test_case { | ||||
|         ggml_set_name(src, "src"); | ||||
|  | ||||
|         if (_src_use_permute) { | ||||
|             src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]); | ||||
|             src = ggml_permute(ctx, src, permute_src[0], permute_src[1], permute_src[2], permute_src[3]); | ||||
|             ggml_set_name(src, "src_permuted"); | ||||
|         } | ||||
|  | ||||
|         ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, src->ne); | ||||
|         ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, src->ne); | ||||
|         ggml_set_name(dst, "dst"); | ||||
|  | ||||
|         if (_dst_use_permute) { | ||||
|             dst = ggml_permute(ctx, dst, permute_dst[0], permute_dst[1], permute_dst[2], permute_dst[3]); | ||||
|             ggml_set_name(dst, "dst_permuted"); | ||||
|         } | ||||
|  | ||||
|         ggml_tensor * out = ggml_cpy(ctx, src, dst); | ||||
|         ggml_set_name(out, "out"); | ||||
|  | ||||
| @@ -1928,6 +1967,40 @@ struct test_gla : public test_case { | ||||
|     } | ||||
| }; | ||||
|  | ||||
| // GGML_OP_RWKV_WKV7 | ||||
| struct test_rwkv_wkv7 : public test_case { | ||||
|     const ggml_type type; | ||||
|  | ||||
|     const int64_t head_count; | ||||
|     const int64_t head_size; | ||||
|     const int64_t n_seq_tokens; | ||||
|     const int64_t n_seqs; | ||||
|  | ||||
|     std::string vars() override { | ||||
|         return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs); | ||||
|     } | ||||
|  | ||||
|     test_rwkv_wkv7(ggml_type type = GGML_TYPE_F32, | ||||
|             int64_t head_count = 32, int64_t head_size = 64, int64_t n_seq_tokens = 32, int64_t n_seqs = 32) | ||||
|         : type(type), head_count(head_count), head_size(head_size), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {} | ||||
|  | ||||
|     ggml_tensor * build_graph(ggml_context * ctx) override { | ||||
|         const int64_t n_tokens = n_seq_tokens * n_seqs; | ||||
|         ggml_tensor * r   = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data()); | ||||
|         ggml_tensor * w   = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data()); | ||||
|         ggml_tensor * k   = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data()); | ||||
|         ggml_tensor * v   = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data()); | ||||
|         ggml_tensor * a   = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data()); | ||||
|         ggml_tensor * b   = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data()); | ||||
|         // Outputs may become NaN with long seqlen without these normalization | ||||
|         a = ggml_l2_norm(ctx, a, 1e-7F); | ||||
|         b = ggml_l2_norm(ctx, b, 1e-7F); | ||||
|         ggml_tensor * s   = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size * head_size * head_count, n_seqs }.data()); | ||||
|         ggml_tensor * out = ggml_rwkv_wkv7(ctx, r, w, k, v, a, b, s); | ||||
|         return out; | ||||
|     } | ||||
| }; | ||||
|  | ||||
| // GGML_OP_MUL_MAT | ||||
| struct test_mul_mat : public test_case { | ||||
|     const ggml_type type_a; | ||||
| @@ -1938,9 +2011,10 @@ struct test_mul_mat : public test_case { | ||||
|     const std::array<int64_t, 2> bs;  // dims 3 and 4 | ||||
|     const std::array<int64_t, 2> nr;  // repeat in dims 3 and 4 | ||||
|     const std::array<int64_t, 4> per; // permutation of dimensions | ||||
|     const bool v; // whether a is a non-contiguous view | ||||
|  | ||||
|     std::string vars() override { | ||||
|         return VARS_TO_STR8(type_a, type_b, m, n, k, bs, nr, per); | ||||
|         return VARS_TO_STR9(type_a, type_b, m, n, k, bs, nr, per, v); | ||||
|     } | ||||
|  | ||||
|     double max_nmse_err() override { | ||||
| @@ -1960,8 +2034,9 @@ struct test_mul_mat : public test_case { | ||||
|             int64_t m = 32, int64_t n = 32, int64_t k = 32, | ||||
|             std::array<int64_t, 2> bs = {10, 10}, | ||||
|             std::array<int64_t, 2> nr = {2, 2}, | ||||
|             std::array<int64_t, 4> per = {0, 1, 2, 3}) | ||||
|         : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per) {} | ||||
|             std::array<int64_t, 4> per = {0, 1, 2, 3}, | ||||
|             bool v = false) | ||||
|         : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v) {} | ||||
|  | ||||
|     ggml_tensor * build_graph(ggml_context * ctx) override { | ||||
|         // C^T = A * B^T: (k, m) * (k, n) => (m, n) | ||||
| @@ -1971,6 +2046,7 @@ struct test_mul_mat : public test_case { | ||||
|         const int npermuted = (per[0] != 0) + (per[1] != 1) + (per[2] != 2) + (per[3] != 3); | ||||
|         if (npermuted > 0) { | ||||
|             GGML_ASSERT(npermuted == 2); | ||||
|             GGML_ASSERT(!v); // not handled | ||||
|             GGML_ASSERT(!ggml_is_quantized(type_a) || per[0] == 0); | ||||
|             GGML_ASSERT(!ggml_is_quantized(type_b) || per[0] == 0); | ||||
|  | ||||
| @@ -1994,7 +2070,13 @@ struct test_mul_mat : public test_case { | ||||
|             ggml_set_name(a, "a_permuted"); | ||||
|             ggml_set_name(b, "b_permuted"); | ||||
|         } else { | ||||
|             a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0],       bs[1]); | ||||
|  | ||||
|             if (v) { | ||||
|                 a = ggml_new_tensor_4d(ctx, type_a, k*2, m, bs[0], bs[1]); | ||||
|                 a = ggml_view_4d(ctx, a, k, m, bs[0], bs[1], a->nb[1], a->nb[2], a->nb[3], 0); | ||||
|             } else { | ||||
|                 a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0],       bs[1]); | ||||
|             } | ||||
|             b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]); | ||||
|             if (!ggml_is_quantized(type_a)) { | ||||
|                 if (bs[1] == 1 && nr[1] == 1) { | ||||
| @@ -2019,7 +2101,7 @@ struct test_mul_mat_id : public test_case { | ||||
|     const ggml_type type_b; | ||||
|     const int n_mats; | ||||
|     const int n_used; | ||||
|     const bool b; // brodcast b matrix | ||||
|     const bool b; // broadcast b matrix | ||||
|     const int64_t m; | ||||
|     const int64_t n; | ||||
|     const int64_t k; | ||||
| @@ -2554,6 +2636,8 @@ struct test_rope : public test_case { | ||||
|             } else { | ||||
|                 out = ggml_rope_ext_back(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); | ||||
|             } | ||||
|  | ||||
|             // TODO: add test with a non-contiguous view as input ; this case is needed for build_rope_2d in clip.cpp | ||||
|         } | ||||
|         ggml_set_name(out, "out"); | ||||
|  | ||||
| @@ -2904,15 +2988,16 @@ struct test_upscale : public test_case { | ||||
|     const std::array<int64_t, 4> ne; | ||||
|     const int32_t scale_factor; | ||||
|     const bool transpose; | ||||
|     const ggml_scale_mode mode; | ||||
|  | ||||
|     std::string vars() override { | ||||
|         return VARS_TO_STR4(type, ne, scale_factor, transpose); | ||||
|         return VARS_TO_STR5(type, ne, scale_factor, mode, transpose); | ||||
|     } | ||||
|  | ||||
|     test_upscale(ggml_type type = GGML_TYPE_F32, | ||||
|             std::array<int64_t, 4> ne = {512, 512, 3, 1}, | ||||
|             int32_t scale_factor = 2, bool transpose = false) | ||||
|         : type(type), ne(ne), scale_factor(scale_factor), transpose(transpose) {} | ||||
|             int32_t scale_factor = 2, ggml_scale_mode mode = GGML_SCALE_MODE_NEAREST, bool transpose = false) | ||||
|         : type(type), ne(ne), scale_factor(scale_factor), transpose(transpose), mode(mode) {} | ||||
|  | ||||
|     ggml_tensor * build_graph(ggml_context * ctx) override { | ||||
|         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); | ||||
| @@ -2923,7 +3008,7 @@ struct test_upscale : public test_case { | ||||
|             ggml_set_name(a, "a_transposed"); | ||||
|         } | ||||
|  | ||||
|         ggml_tensor * out = ggml_upscale(ctx, a, scale_factor); | ||||
|         ggml_tensor * out = ggml_upscale(ctx, a, scale_factor, mode); | ||||
|         ggml_set_name(out, "out"); | ||||
|  | ||||
|         return out; | ||||
| @@ -2935,21 +3020,23 @@ struct test_upscale_ext : public test_case { | ||||
|     const ggml_type type; | ||||
|     const std::array<int64_t, 4> ne; | ||||
|     const std::array<int64_t, 4> ne_tgt; | ||||
|     const ggml_scale_mode mode = GGML_SCALE_MODE_NEAREST; | ||||
|  | ||||
|     std::string vars() override { | ||||
|         return VARS_TO_STR3(type, ne, ne_tgt); | ||||
|         return VARS_TO_STR4(type, ne, ne_tgt, mode); | ||||
|     } | ||||
|  | ||||
|     test_upscale_ext(ggml_type type = GGML_TYPE_F32, | ||||
|             std::array<int64_t, 4> ne     = {2, 5,  7, 11}, | ||||
|             std::array<int64_t, 4> ne_tgt = {5, 7, 11, 13}) | ||||
|         : type(type), ne(ne), ne_tgt(ne_tgt) {} | ||||
|             std::array<int64_t, 4> ne_tgt = {5, 7, 11, 13}, | ||||
|             ggml_scale_mode mode = GGML_SCALE_MODE_NEAREST) | ||||
|         : type(type), ne(ne), ne_tgt(ne_tgt), mode(mode) {} | ||||
|  | ||||
|     ggml_tensor * build_graph(ggml_context * ctx) override { | ||||
|         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); | ||||
|         ggml_set_name(a, "a"); | ||||
|  | ||||
|         ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3]); | ||||
|         ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3], mode); | ||||
|         ggml_set_name(out, "out"); | ||||
|  | ||||
|         return out; | ||||
| @@ -2984,6 +3071,32 @@ struct test_group_norm : public test_case { | ||||
|     } | ||||
| }; | ||||
|  | ||||
| // GGML_OP_L2_NORM | ||||
| struct test_l2_norm : public test_case { | ||||
|     const ggml_type type; | ||||
|     const std::array<int64_t, 4> ne; | ||||
|     const float eps; | ||||
|  | ||||
|     std::string vars() override { | ||||
|         return VARS_TO_STR2(type, ne); | ||||
|     } | ||||
|  | ||||
|     test_l2_norm(ggml_type type = GGML_TYPE_F32, | ||||
|             std::array<int64_t, 4> ne = {64, 64, 320, 1}, | ||||
|             float eps = 1e-12f) | ||||
|         : type(type), ne(ne), eps(eps) {} | ||||
|  | ||||
|     ggml_tensor * build_graph(ggml_context * ctx) override { | ||||
|         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); | ||||
|         ggml_set_name(a, "a"); | ||||
|  | ||||
|         ggml_tensor * out = ggml_l2_norm(ctx, a, eps); | ||||
|         ggml_set_name(out, "out"); | ||||
|  | ||||
|         return out; | ||||
|     } | ||||
| }; | ||||
|  | ||||
| // GGML_OP_ACC | ||||
| struct test_acc : public test_case { | ||||
|     const ggml_type type; | ||||
| @@ -3147,7 +3260,8 @@ struct test_leaky_relu : public test_case { | ||||
|  | ||||
| // GGML_OP_FLASH_ATTN_EXT | ||||
| struct test_flash_attn_ext : public test_case { | ||||
|     const int64_t hs; // head size | ||||
|     const int64_t hsk; // K head size | ||||
|     const int64_t hsv; // V head size | ||||
|     const int64_t nh; // num heads | ||||
|     const int64_t nr; // repeat in Q, tests for grouped-query attention | ||||
|     const int64_t kv; // kv size | ||||
| @@ -3158,11 +3272,12 @@ struct test_flash_attn_ext : public test_case { | ||||
|     const float max_bias; // ALiBi | ||||
|     const float logit_softcap; // Gemma 2 | ||||
|  | ||||
|     const ggml_prec prec; | ||||
|     const ggml_type type_KV; | ||||
|     std::array<int32_t, 4> permute; | ||||
|  | ||||
|     std::string vars() override { | ||||
|         return VARS_TO_STR10(hs, nh, nr, kv, nb, mask, max_bias, logit_softcap, type_KV, permute); | ||||
|         return VARS_TO_STR12(hsk, hsv, nh, nr, kv, nb, mask, max_bias, logit_softcap, prec, type_KV, permute); | ||||
|     } | ||||
|  | ||||
|     double max_nmse_err() override { | ||||
| @@ -3172,17 +3287,18 @@ struct test_flash_attn_ext : public test_case { | ||||
|     uint64_t op_flops(ggml_tensor * t) override { | ||||
|         GGML_UNUSED(t); | ||||
|         // Just counting matmul costs: | ||||
|         // Q*K^T is nb x hs x kv, P*V is nb x kv x hs, per head | ||||
|         return 2 * 2 * nh*nr * nb * hs * kv; | ||||
|         // Q*K^T is nb x hsk x kv, P*V is nb x kv x hsv, per head | ||||
|         return 2 * nh*nr * nb * (hsk + hsv) * kv; | ||||
|     } | ||||
|  | ||||
|     test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t nr = 1, int64_t kv = 96, int64_t nb = 8, | ||||
|                         bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16, | ||||
|                         std::array<int32_t, 4> permute = {0, 1, 2, 3}) | ||||
|         : hs(hs), nh(nh), nr(nr), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), type_KV(type_KV), permute(permute) {} | ||||
|     test_flash_attn_ext(int64_t hsk = 128, int64_t hsv = 128, int64_t nh = 32, int64_t nr = 1, int64_t kv = 96, int64_t nb = 8, | ||||
|                         bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_prec prec = GGML_PREC_F32, | ||||
|                         ggml_type type_KV = GGML_TYPE_F16, std::array<int32_t, 4> permute = {0, 1, 2, 3}) | ||||
|         : hsk(hsk), hsv(hsv), nh(nh), nr(nr), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), prec(prec), type_KV(type_KV), permute(permute) {} | ||||
|  | ||||
|     ggml_tensor * build_graph(ggml_context * ctx) override { | ||||
|         const int64_t hs_padded = GGML_PAD(hs, ggml_blck_size(type_KV)); | ||||
|         const int64_t hsk_padded = GGML_PAD(hsk, ggml_blck_size(type_KV)); | ||||
|         const int64_t hsv_padded = GGML_PAD(hsv, ggml_blck_size(type_KV)); | ||||
|  | ||||
|         auto const &create_permuted = [&](ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) -> ggml_tensor * { | ||||
|             int64_t ne[4] = {ne0, ne1, ne2, ne3}; | ||||
| @@ -3197,13 +3313,13 @@ struct test_flash_attn_ext : public test_case { | ||||
|             return t; | ||||
|         }; | ||||
|  | ||||
|         ggml_tensor * q = create_permuted(GGML_TYPE_F32, hs_padded, nb, nh*nr, 1); | ||||
|         ggml_tensor * q = create_permuted(GGML_TYPE_F32, hsk_padded, nb, nh*nr, 1); | ||||
|         ggml_set_name(q, "q"); | ||||
|  | ||||
|         ggml_tensor * k = create_permuted(type_KV,       hs_padded, kv, nh,    1); | ||||
|         ggml_tensor * k = create_permuted(type_KV,       hsk_padded, kv, nh,    1); | ||||
|         ggml_set_name(k, "k"); | ||||
|  | ||||
|         ggml_tensor * v = create_permuted(type_KV,       hs_padded, kv, nh,    1); | ||||
|         ggml_tensor * v = create_permuted(type_KV,       hsv_padded, kv, nh,    1); | ||||
|         ggml_set_name(v, "v"); | ||||
|  | ||||
|         ggml_tensor * m = nullptr; | ||||
| @@ -3212,7 +3328,8 @@ struct test_flash_attn_ext : public test_case { | ||||
|             ggml_set_name(m, "m"); | ||||
|         } | ||||
|  | ||||
|         ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias, logit_softcap); | ||||
|         ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hsk), max_bias, logit_softcap); | ||||
|         ggml_flash_attn_ext_set_prec(out, prec); | ||||
|         ggml_set_name(out, "out"); | ||||
|  | ||||
|         return out; | ||||
| @@ -3783,10 +3900,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() { | ||||
|     std::default_random_engine rng(0); | ||||
|  | ||||
|     // unary ops | ||||
|     for (int v : {0, 1}) { | ||||
|         for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) { | ||||
|             test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 128, 2, 2, 2 }, v)); | ||||
|             test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 5, 7, 11, 13 }, v)); | ||||
|     for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) { | ||||
|         for (int v : {0, 1}) { | ||||
|             for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) { | ||||
|                 test_cases.emplace_back(new test_unary((ggml_unary_op) op, type, { 128, 2, 2, 2 }, v)); | ||||
|                 test_cases.emplace_back(new test_unary((ggml_unary_op) op, type, { 5, 7, 11, 13 }, v)); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
| @@ -3939,14 +4058,25 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() { | ||||
|         test_cases.emplace_back(new test_set(GGML_TYPE_I32, GGML_TYPE_I32, {6, 5, 4, 3}, dim)); | ||||
|     } | ||||
|  | ||||
|     for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) { | ||||
|     // same-type copy | ||||
|     for (ggml_type type : all_types) { | ||||
|         const auto nk = ggml_blck_size(type); | ||||
|  | ||||
|         for (int k = 1; k < 4; ++k) { | ||||
|             test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4})); | ||||
|             test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 2, 1, 3})); | ||||
|             test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 3, 1, 2}, {0, 2, 1, 3})); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) { | ||||
|         for (ggml_type type_dst : all_types) { | ||||
|             test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); | ||||
|             test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows | ||||
|         } | ||||
|     } | ||||
|     for (ggml_type type_dst : {GGML_TYPE_F32}) { | ||||
|         for (ggml_type type_src : all_types) { | ||||
|     for (ggml_type type_src : all_types) { | ||||
|         for (ggml_type type_dst : {GGML_TYPE_F32}) { | ||||
|             test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); | ||||
|             test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows | ||||
|         } | ||||
| @@ -3973,37 +4103,38 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() { | ||||
|             test_cases.emplace_back(new test_bin_bcast(op, type, ne, nr)); | ||||
|         } | ||||
|     }; | ||||
|     for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) { | ||||
|         add_test_bin_bcast(type, {1, 1, 8, 1}, {1, 1, 1, 1}); | ||||
|         add_test_bin_bcast(type, {1, 1, 1, 1}, {32, 1, 1, 1}); | ||||
|         add_test_bin_bcast(type, {1, 1, 320, 320}, {1, 1, 1, 1}); | ||||
|         add_test_bin_bcast(type, {10, 5, 1, 1}, {1, 1, 1, 1}); | ||||
|         add_test_bin_bcast(type, {10, 5, 4, 1}, {1, 1, 1, 1}); | ||||
|         add_test_bin_bcast(type, {10, 5, 4, 3}, {1, 1, 1, 1}); | ||||
|         add_test_bin_bcast(type, {10, 5, 4, 3}, {2, 1, 1, 1}); | ||||
|         add_test_bin_bcast(type, {10, 5, 4, 3}, {1, 2, 1, 1}); | ||||
|         add_test_bin_bcast(type, {10, 5, 4, 3}, {1, 1, 2, 1}); | ||||
|         add_test_bin_bcast(type, {10, 5, 4, 3}, {1, 1, 1, 2}); | ||||
|         add_test_bin_bcast(type, {10, 5, 4, 3}, {1, 1, 2, 2}); | ||||
|         add_test_bin_bcast(type, {10, 5, 4, 3}, {1, 2, 2, 2}); | ||||
|         add_test_bin_bcast(type, {10, 5, 4, 3}, {2, 2, 2, 2}); | ||||
|  | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1, 1}, {32, 1, 1, 1}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 1, 1}, {1, 1, 1, 1}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 1}, {1, 1, 1, 1}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 1}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {2, 1, 1, 1}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 2, 1, 1}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 1}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 2}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 2}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 2, 2, 2}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {2, 2, 2, 2}); | ||||
|  | ||||
|     // stable diffusion | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 1, 1, 1}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 16, 16, 1}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {1280, 16, 16, 1}, {1, 1, 1, 1}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 256, 1, 1}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1280, 1}, {16, 16, 1, 1}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {16, 16, 1280, 1}, {1, 1, 1, 1}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1920, 1}, {16, 16, 1, 1}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 2560, 1}, {16, 16, 1, 1}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1280, 1}, {32, 32, 1, 1}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1920, 1}, {32, 32, 1, 1}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 640, 1}, {32, 32, 1, 1}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {5120, 1, 1, 1}, {1, 256, 1, 1}); | ||||
|     add_test_bin_bcast(GGML_TYPE_F32, {640, 1, 1, 1}, {1, 1, 1, 1}); | ||||
|     //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {1, 1, 1, 1}); | ||||
|     //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {2, 1, 1, 1}); | ||||
|         // stable diffusion | ||||
|         add_test_bin_bcast(type, {1280, 1, 1, 1}, {1, 1, 1, 1}); | ||||
|         add_test_bin_bcast(type, {1280, 1, 1, 1}, {1, 16, 16, 1}); | ||||
|         add_test_bin_bcast(type, {1280, 16, 16, 1}, {1, 1, 1, 1}); | ||||
|         add_test_bin_bcast(type, {1280, 1, 1, 1}, {1, 256, 1, 1}); | ||||
|         add_test_bin_bcast(type, {1, 1, 1280, 1}, {16, 16, 1, 1}); | ||||
|         add_test_bin_bcast(type, {16, 16, 1280, 1}, {1, 1, 1, 1}); | ||||
|         add_test_bin_bcast(type, {1, 1, 1920, 1}, {16, 16, 1, 1}); | ||||
|         add_test_bin_bcast(type, {1, 1, 2560, 1}, {16, 16, 1, 1}); | ||||
|         add_test_bin_bcast(type, {1, 1, 1280, 1}, {32, 32, 1, 1}); | ||||
|         add_test_bin_bcast(type, {1, 1, 1920, 1}, {32, 32, 1, 1}); | ||||
|         add_test_bin_bcast(type, {1, 1, 640, 1}, {32, 32, 1, 1}); | ||||
|         add_test_bin_bcast(type, {5120, 1, 1, 1}, {1, 256, 1, 1}); | ||||
|         add_test_bin_bcast(type, {640, 1, 1, 1}, {1, 1, 1, 1}); | ||||
|         //add_test_bin_bcast(type, {3, 3, 2560, 1280}, {1, 1, 1, 1}); | ||||
|         //add_test_bin_bcast(type, {3, 3, 2560, 1280}, {2, 1, 1, 1}); | ||||
|     } | ||||
|  | ||||
|     test_cases.emplace_back(new test_add1()); | ||||
|     test_cases.emplace_back(new test_scale()); | ||||
| @@ -4015,8 +4146,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() { | ||||
|             test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, v, eps)); | ||||
|         } | ||||
|         test_cases.emplace_back(new test_rms_norm_back(GGML_TYPE_F32, {64, 5, 4, 3}, eps)); | ||||
|         test_cases.emplace_back(new test_l2_norm      (GGML_TYPE_F32, {64, 5, 4, 3}, eps)); | ||||
|     } | ||||
|  | ||||
|     test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, {64, 5, 4, 3}, 1e-12f)); | ||||
|  | ||||
|     test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 1, 1}, {4, 1536, 1, 1})); | ||||
|     test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {8, 1536, 1, 1}, {4, 1536, 1, 1})); | ||||
|     test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 4, 1}, {4, 1536, 1, 1})); | ||||
| @@ -4029,6 +4163,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() { | ||||
|     test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 4)); | ||||
|     test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 128, 4)); | ||||
|  | ||||
|     test_cases.emplace_back(new test_rwkv_wkv7(GGML_TYPE_F32, 32, 64, 1, 1)); | ||||
|     test_cases.emplace_back(new test_rwkv_wkv7(GGML_TYPE_F32, 32, 64, 32, 1)); | ||||
|     test_cases.emplace_back(new test_rwkv_wkv7(GGML_TYPE_F32, 32, 64, 32, 4)); | ||||
|     test_cases.emplace_back(new test_rwkv_wkv7(GGML_TYPE_F32, 32, 64, 128, 4)); | ||||
|  | ||||
|     test_cases.emplace_back(new test_gla(GGML_TYPE_F32, 32, 64, 1, 1)); | ||||
|     test_cases.emplace_back(new test_gla(GGML_TYPE_F32, 32, 64, 32, 1)); | ||||
|     test_cases.emplace_back(new test_gla(GGML_TYPE_F32, 32, 64, 32, 4)); | ||||
| @@ -4076,6 +4215,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() { | ||||
|             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 2, 1, 3})); | ||||
|             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 1, 3, 2})); | ||||
|             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 3, 2, 1})); | ||||
|  | ||||
|             // test cases with large ne00/ne10 to cover stream-k fixup | ||||
|             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 1024, {3, 2}, {1, 1})); | ||||
|             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  8, 1024, {3, 2}, {1, 1})); | ||||
|             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 1024, {3, 2}, {1, 1})); | ||||
|         } | ||||
|     } | ||||
|     for (ggml_type type_a : other_types) { | ||||
| @@ -4111,6 +4255,19 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() { | ||||
|     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32,  83, 2,   64, { 8,  1}, {4, 1})); | ||||
|     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32,  64, 45, 128, { 8,  1}, {4, 1})); | ||||
|     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45,  64, { 8,  1}, {4, 1})); | ||||
|     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1,  1}, {4, 1}, {0, 2, 1, 3})); | ||||
|     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67,  {1,  1}, {4, 1}, {0, 2, 1, 3})); | ||||
|  | ||||
|     for (auto bs : {1,2,4,8}) { | ||||
|         for (auto nr : {1,4}) { | ||||
|             for (uint32_t m = 0; m < 2; ++m) { | ||||
|                 for (uint32_t k = 0; k < 2; ++k) { | ||||
|                     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056 + m, 1, 128 + k,  {bs,  1}, {nr, 1}, {0, 2, 1, 3})); | ||||
|                     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128 + m,  1, 1056 + k, {bs,  1}, {nr, 1}, {0, 1, 2, 3}, true)); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // sycl backend will limit task global_range < MAX_INT | ||||
|     // test case for f16-type-convert-to-fp32 kernel with large k under fp32 compute dtype (occurs in stable-diffusion) | ||||
| @@ -4123,7 +4280,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() { | ||||
|             for (int n_mats : {4, 8}) { | ||||
|                 for (int n_used : {1, 2, 4}) { | ||||
|                     for (bool b : {false, true}) { | ||||
|                         for (int n : {1, 32}) { | ||||
|                         for (int n : {1, 32, 129}) { | ||||
|                             int m = 512; | ||||
|                             int k = 256; | ||||
|                             test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k)); | ||||
| @@ -4168,12 +4325,14 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() { | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     test_cases.emplace_back(new test_sqr()); | ||||
|     test_cases.emplace_back(new test_sqrt()); | ||||
|     test_cases.emplace_back(new test_log()); | ||||
|     test_cases.emplace_back(new test_sin()); | ||||
|     test_cases.emplace_back(new test_cos()); | ||||
|     test_cases.emplace_back(new test_clamp()); | ||||
|     for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) { | ||||
|         test_cases.emplace_back(new test_sqr(type)); | ||||
|         test_cases.emplace_back(new test_sqrt(type)); | ||||
|         test_cases.emplace_back(new test_log(type)); | ||||
|         test_cases.emplace_back(new test_sin(type)); | ||||
|         test_cases.emplace_back(new test_cos(type)); | ||||
|         test_cases.emplace_back(new test_clamp(type)); | ||||
|     } | ||||
|  | ||||
|     test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5)); | ||||
|     test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 3, 1}, 5)); | ||||
| @@ -4289,12 +4448,15 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() { | ||||
|         test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen | ||||
|     } | ||||
|  | ||||
|     for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR}) { | ||||
|         test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, {512, 512, 3, 2}, 2, mode)); | ||||
|         test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, {512, 512, 3, 2}, 2, mode, true)); | ||||
|         test_cases.emplace_back(new test_upscale_ext(GGML_TYPE_F32, {2, 5,  7, 11}, {5, 7, 11, 13}, mode)); | ||||
|     } | ||||
|  | ||||
|     test_cases.emplace_back(new test_sum()); | ||||
|     test_cases.emplace_back(new test_sum_rows()); | ||||
|     test_cases.emplace_back(new test_mean()); | ||||
|     test_cases.emplace_back(new test_upscale()); | ||||
|     test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true)); | ||||
|     test_cases.emplace_back(new test_upscale_ext()); | ||||
|     test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {64, 64, 320, 1})); | ||||
|     test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {9, 9, 1280, 1})); | ||||
|     test_cases.emplace_back(new test_acc()); | ||||
| @@ -4304,23 +4466,34 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() { | ||||
|     test_cases.emplace_back(new test_timestep_embedding()); | ||||
|     test_cases.emplace_back(new test_leaky_relu()); | ||||
|  | ||||
|     for (int hs : { 64, 80, 128, 256, }) { | ||||
|         for (bool mask : { true, false } ) { | ||||
|             for (float max_bias : { 0.0f, 8.0f }) { | ||||
|                 if (!mask && max_bias > 0.0f) continue; | ||||
|                 for (float logit_softcap : {0.0f, 10.0f}) { | ||||
|                     if (hs != 128 && logit_softcap != 0.0f) continue; | ||||
|                     for (int nh : { 4, }) { | ||||
|                         for (int nr : { 1, 4, 16 }) { | ||||
|                             if (nr == 16 && hs != 128) continue; | ||||
|                             for (int kv : { 512, 1024, }) { | ||||
|                                 if (nr != 1 && kv != 512) continue; | ||||
|                                 for (int nb : { 1, 3, 32, 35, }) { | ||||
|                                     for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) { | ||||
|                                         test_cases.emplace_back(new test_flash_attn_ext(hs, nh, nr, kv, nb, mask, max_bias, logit_softcap, type_KV)); | ||||
|                                         // run fewer test cases permuted | ||||
|                                         if (mask == true && max_bias == 0.0f && logit_softcap == 0 && kv == 512) { | ||||
|                                             test_cases.emplace_back(new test_flash_attn_ext(hs, nh, nr, kv, nb, mask, max_bias, logit_softcap, type_KV, {0, 2, 1, 3})); | ||||
|     for (int hsk : { 64, 80, 128, 192, 256, 576 }) { | ||||
|         for (int hsv : { 64, 80, 128, 192, 256, 512 }) { | ||||
|             if (hsk != 192 && hsk != 576 && hsk != hsv) continue; | ||||
|             if (hsk == 192 && (hsv != 128 && hsv != 192)) continue; | ||||
|             if (hsk == 576 && hsv != 512) continue; // DeepSeek MLA | ||||
|  | ||||
|             for (bool mask : { true, false } ) { | ||||
|                 for (float max_bias : { 0.0f, 8.0f }) { | ||||
|                     if (!mask && max_bias > 0.0f) continue; | ||||
|                     for (float logit_softcap : {0.0f, 10.0f}) { | ||||
|                         if (hsk != 128 && logit_softcap != 0.0f) continue; | ||||
|                         for (int nh : { 4, }) { | ||||
|                             for (int nr : { 1, 4, 16 }) { | ||||
|                                 if (nr == 16 && hsk != 128) continue; | ||||
|                                 for (int kv : { 512, 1024, }) { | ||||
|                                     if (nr != 1 && kv != 512) continue; | ||||
|                                     for (int nb : { 1, 3, 32, 35, }) { | ||||
|                                         for (ggml_prec prec : {GGML_PREC_F32, GGML_PREC_DEFAULT}) { | ||||
|                                             if (hsk != 128 && prec == GGML_PREC_DEFAULT) continue; | ||||
|                                             for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) { | ||||
|                                                 test_cases.emplace_back(new test_flash_attn_ext( | ||||
|                                                     hsk, hsv, nh, nr, kv, nb, mask, max_bias, logit_softcap, prec, type_KV)); | ||||
|                                                 // run fewer test cases permuted | ||||
|                                                 if (mask == true && max_bias == 0.0f && logit_softcap == 0 && kv == 512) { | ||||
|                                                     test_cases.emplace_back(new test_flash_attn_ext( | ||||
|                                                         hsk, hsv, nh, nr, kv, nb, mask, max_bias, logit_softcap, prec, type_KV, {0, 2, 1, 3})); | ||||
|                                                 } | ||||
|                                             } | ||||
|                                         } | ||||
|                                     } | ||||
|                                 } | ||||
| @@ -4373,6 +4546,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() { | ||||
|     test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1})); | ||||
|     test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32000, 512, 1, 1})); | ||||
|  | ||||
|     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8,  1}, {4, 1}, {0, 2, 1, 3})); | ||||
|     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8,  1}, {4, 1}, {0, 1, 2, 3}, true)); | ||||
|  | ||||
|     for (int bs : {1, 2, 3, 4, 5, 8, 512}) { | ||||
|         for (ggml_type type_a : all_types) { | ||||
|             for (ggml_type type_b : {GGML_TYPE_F32}) { | ||||
| @@ -4393,12 +4569,38 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() { | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     for (int kv : { 4096, 8192, 16384, }) { | ||||
|         for (int hs : { 64, 128, }) { | ||||
|             for (int nr : { 1, 4, }) { | ||||
|                 test_cases.emplace_back(new test_flash_attn_ext(hs, hs, 8, nr, kv, 1, true, 0, 0, GGML_PREC_F32, GGML_TYPE_F16)); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     return test_cases; | ||||
| } | ||||
|  | ||||
| static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) { | ||||
| static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name, const char * params_filter) { | ||||
|     auto filter_test_cases = [](std::vector<std::unique_ptr<test_case>> & test_cases, const char * params_filter) { | ||||
|         if (params_filter == nullptr) { | ||||
|             return; | ||||
|         } | ||||
|  | ||||
|         std::regex params_filter_regex(params_filter); | ||||
|  | ||||
|         for (auto it = test_cases.begin(); it != test_cases.end();) { | ||||
|             if (!std::regex_search((*it)->vars(), params_filter_regex)) { | ||||
|                 it = test_cases.erase(it); | ||||
|                 continue; | ||||
|             } | ||||
|  | ||||
|             it++; | ||||
|         } | ||||
|     }; | ||||
|  | ||||
|     if (mode == MODE_TEST) { | ||||
|         auto test_cases = make_test_cases_eval(); | ||||
|         filter_test_cases(test_cases, params_filter); | ||||
|         ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL); | ||||
|         if (backend_cpu == NULL) { | ||||
|             printf("  Failed to initialize CPU backend\n"); | ||||
| @@ -4420,6 +4622,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op | ||||
|  | ||||
|     if (mode == MODE_GRAD) { | ||||
|         auto test_cases = make_test_cases_eval(); | ||||
|         filter_test_cases(test_cases, params_filter); | ||||
|         size_t n_ok = 0; | ||||
|         for (auto & test : test_cases) { | ||||
|             if (test->eval_grad(backend, op_name)) { | ||||
| @@ -4433,6 +4636,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op | ||||
|  | ||||
|     if (mode == MODE_PERF) { | ||||
|         auto test_cases = make_test_cases_perf(); | ||||
|         filter_test_cases(test_cases, params_filter); | ||||
|         for (auto & test : test_cases) { | ||||
|             test->eval_perf(backend, op_name); | ||||
|         } | ||||
| @@ -4443,7 +4647,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op | ||||
| } | ||||
|  | ||||
| static void usage(char ** argv) { | ||||
|     printf("Usage: %s [mode] [-o op] [-b backend]\n", argv[0]); | ||||
|     printf("Usage: %s [mode] [-o <op>] [-b <backend>] [-p <params regex>]\n", argv[0]); | ||||
|     printf("    valid modes:\n"); | ||||
|     printf("      - test (default, compare with CPU backend for correctness)\n"); | ||||
|     printf("      - grad (compare gradients from backpropagation with method of finite differences)\n"); | ||||
| @@ -4453,8 +4657,9 @@ static void usage(char ** argv) { | ||||
|  | ||||
| int main(int argc, char ** argv) { | ||||
|     test_mode mode = MODE_TEST; | ||||
|     const char * op_name_filter = NULL; | ||||
|     const char * backend_filter = NULL; | ||||
|     const char * op_name_filter = nullptr; | ||||
|     const char * backend_filter = nullptr; | ||||
|     const char * params_filter = nullptr; | ||||
|  | ||||
|     for (int i = 1; i < argc; i++) { | ||||
|         if (strcmp(argv[i], "test") == 0) { | ||||
| @@ -4477,6 +4682,13 @@ int main(int argc, char ** argv) { | ||||
|                 usage(argv); | ||||
|                 return 1; | ||||
|             } | ||||
|         } else if (strcmp(argv[i], "-p") == 0) { | ||||
|             if (i + 1 < argc) { | ||||
|                 params_filter = argv[++i]; | ||||
|             } else { | ||||
|                 usage(argv); | ||||
|                 return 1; | ||||
|             } | ||||
|         } else { | ||||
|             usage(argv); | ||||
|             return 1; | ||||
| @@ -4523,7 +4735,7 @@ int main(int argc, char ** argv) { | ||||
|         printf("  Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024); | ||||
|         printf("\n"); | ||||
|  | ||||
|         bool ok = test_backend(backend, mode, op_name_filter); | ||||
|         bool ok = test_backend(backend, mode, op_name_filter, params_filter); | ||||
|  | ||||
|         printf("  Backend %s: ", ggml_backend_name(backend)); | ||||
|         if (ok) { | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Francis Couture-Harpin
					Francis Couture-Harpin