	vulkan: Reuse conversion results in prealloc_y (#15410)
* vulkan: Reuse conversion results in prealloc_y

  Cache the pipeline and tensor that were most recently used to fill prealloc_y,
  and skip the conversion if the current pipeline/tensor match.

* don't use shared pointer for prealloc_y_last_pipeline_used
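To make the idea concrete, here is a minimal standalone sketch of the caching pattern (hypothetical types and function names, not the actual ggml-vulkan API): remember the last (pipeline, tensor) pair that filled the preallocated Y buffer, skip the conversion dispatch when the same pair repeats, and clear the pair whenever the buffer contents can no longer be trusted.

    // Standalone sketch of the prealloc_y reuse idea; all names here are illustrative.
    #include <cstdio>

    struct Pipeline {};  // stands in for the conversion pipeline (e.g. to_fp16 / to_q8_1)
    struct Tensor   {};  // stands in for the source tensor (src1)

    struct Context {
        const Pipeline * last_pipeline = nullptr;  // pipeline that last filled the Y buffer
        const Tensor   * last_tensor   = nullptr;  // tensor that last filled the Y buffer
    };

    // Pretend conversion that fills the preallocated Y buffer.
    static void convert(const Pipeline *, const Tensor *) {
        std::puts("dispatching conversion");
    }

    // Dispatch the conversion only when the cached (pipeline, tensor) pair does not match.
    static void maybe_convert(Context & ctx, const Pipeline * pipe, const Tensor * src1) {
        if (ctx.last_pipeline != pipe || ctx.last_tensor != src1) {
            convert(pipe, src1);
            ctx.last_pipeline = pipe;
            ctx.last_tensor   = src1;
        } else {
            std::puts("reusing previous conversion");
        }
    }

    int main() {
        Context  ctx;
        Pipeline to_fp16;
        Tensor   src1;
        maybe_convert(ctx, &to_fp16, &src1);  // converts
        maybe_convert(ctx, &to_fp16, &src1);  // reuses the cached result
        ctx = {};                             // invalidate, e.g. on cleanup or when the buffer may be reused
        maybe_convert(ctx, &to_fp16, &src1);  // converts again
    }

The hunks below apply this check in the four matmul paths that fill prealloc_y, reset the cached pair in ggml_vk_graph_cleanup, ggml_vk_cleanup, and ggml_backend_vk_graph_compute where the buffer contents may become stale, and also invalidate it between splits in ggml_vk_mul_mat_id, since the cache key there would point at a temporary copy of the tensor.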
@@ -1193,6 +1193,10 @@ struct ggml_backend_vk_context {
     vk::Fence fence, almost_ready_fence;
     bool almost_ready_fence_pending {};

+    // Cache most recent tensor that was converted into prealloc_y, and what pipeline it used to convert.
+    vk_pipeline_struct * prealloc_y_last_pipeline_used {};
+    const ggml_tensor * prealloc_y_last_tensor_used {};
+
     vk_buffer buffer_pool[MAX_VK_BUFFERS];

     vk_context_ref compute_ctx;
@@ -5651,10 +5655,20 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
         ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
     }
     if (y_non_contig) {
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
+            ctx->prealloc_y_last_tensor_used != src1) {
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
+            ctx->prealloc_y_last_tensor_used = src1;
+        }
     }
     if (quantize_y) {
-        ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);
+        if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() ||
+            ctx->prealloc_y_last_tensor_used != src1) {
+            ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);
+            ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
+            ctx->prealloc_y_last_tensor_used = src1;
+        }
     }

     uint32_t stride_batch_x = ne00*ne01;
@@ -5829,7 +5843,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
     }
     if (y_non_contig) {
         GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
+            ctx->prealloc_y_last_tensor_used != src1) {
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
+            ctx->prealloc_y_last_tensor_used = src1;
+        }
     }

     // For batch_n, the A matrix is the same for each batch, and B/D use the row stride as the batch stride
@@ -6259,7 +6278,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
             { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
     }
     if (y_non_contig) {
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
+            ctx->prealloc_y_last_tensor_used != src1) {
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
+            ctx->prealloc_y_last_tensor_used = src1;
+        }
     }

     uint32_t stride_batch_x = ne00*ne01;
@@ -6447,7 +6471,12 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     }
     if (y_non_contig) {
         GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
+            ctx->prealloc_y_last_tensor_used != src1) {
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
+            ctx->prealloc_y_last_tensor_used = src1;
+        }
     }

     uint32_t stride_batch_y = ne10*ne11;
@@ -6491,22 +6520,29 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx
         GGML_ASSERT(nei0 <= 4096);
         const uint32_t split_size = std::min(nei1, 4096u / nei0);

-        ggml_tensor src1_copy = *src1;
-        ggml_tensor src2_copy = *src2;
-        ggml_tensor dst_copy = *dst;
+        if (split_size == nei1) {
+            ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
+        } else {
+            ggml_tensor src1_copy = *src1;
+            ggml_tensor src2_copy = *src2;
+            ggml_tensor dst_copy = *dst;

-        for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) {
-            const uint32_t n_tokens = std::min(split_size, nei1 - token_start);
+            for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) {
+                const uint32_t n_tokens = std::min(split_size, nei1 - token_start);

-            src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2];
-            src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1];
-            dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2];
+                src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2];
+                src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1];
+                dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2];

-            src1_copy.ne[2] = n_tokens;
-            src2_copy.ne[1] = n_tokens;
-            dst_copy.ne[2] = n_tokens;
+                src1_copy.ne[2] = n_tokens;
+                src2_copy.ne[1] = n_tokens;
+                dst_copy.ne[2] = n_tokens;

-            ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun);
+                ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun);
+                // invalidate cached prealloc_y, can't cache based on the copy of the ggml_tensor
+                ctx->prealloc_y_last_pipeline_used = {};
+                ctx->prealloc_y_last_tensor_used = nullptr;
+            }
         }
     }
 }
@@ -10311,6 +10347,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
         ggml_vk_pool_free(ctx, buffer);
     }
     ctx->gc.temp_buffers.clear();
+    ctx->prealloc_y_last_pipeline_used = {};

     ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
     ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
@@ -10346,6 +10383,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
     ggml_vk_destroy_buffer(ctx->prealloc_x);
     ggml_vk_destroy_buffer(ctx->prealloc_y);
     ggml_vk_destroy_buffer(ctx->prealloc_split_k);
+    ctx->prealloc_y_last_pipeline_used = nullptr;

     for (auto& buffer : ctx->buffer_pool) {
         ggml_vk_destroy_buffer(buffer);
@@ -10894,6 +10932,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
     }

+    ctx->prealloc_y_last_pipeline_used = nullptr;
+    ctx->prealloc_y_last_tensor_used = nullptr;
+
     // Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
     // Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
     // (and scaled down based on model size, so smaller models submit earlier).
@@ -3098,9 +3098,10 @@ struct test_mul_mat : public test_case {
     const std::array<int64_t, 2> nr;  // repeat in dims 3 and 4
     const std::array<int64_t, 4> per; // permutation of dimensions
     const bool v; // whether a and b are non-contiguous views
+    const uint32_t o; // number of outputs

     std::string vars() override {
-        return VARS_TO_STR9(type_a, type_b, m, n, k, bs, nr, per, v);
+        return VARS_TO_STR10(type_a, type_b, m, n, k, bs, nr, per, v, o);
     }

     double max_nmse_err() override {
@@ -3121,8 +3122,8 @@ struct test_mul_mat : public test_case {
             std::array<int64_t, 2> bs = {10, 10},
             std::array<int64_t, 2> nr = {2, 2},
             std::array<int64_t, 4> per = {0, 1, 2, 3},
-            bool v = false)
-        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v) {}
+            bool v = false, uint32_t o = 1)
+        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v), o(o) {}

     ggml_tensor * build_graph(ggml_context * ctx) override {
         // C^T = A * B^T: (k, m) * (k, n) => (m, n)
@@ -3186,9 +3187,21 @@ struct test_mul_mat : public test_case {

         ggml_tensor * out = ggml_mul_mat(ctx, a, b);
         ggml_set_name(out, "out");
+        for (uint32_t i = 1; i < o; ++i) {
+            ggml_tensor * out2 = ggml_mul_mat(ctx, a, b);
+            ggml_set_name(out2, "out2");
+            out = ggml_add(ctx, out, out2);
+        }
+
         return out;
     }

+    bool run_whole_graph() override { return o > 1; }
+
+    std::string op_desc(ggml_tensor * t) override {
+        GGML_UNUSED(t);
+        return ggml_op_name(GGML_OP_MUL_MAT);
+    }
 };

 // GGML_OP_MUL_MAT_ID
@@ -3201,9 +3214,10 @@ struct test_mul_mat_id : public test_case {
     const int64_t m;
     const int64_t n;
     const int64_t k;
+    const uint32_t o; // number of outputs

     std::string vars() override {
-        return VARS_TO_STR8(type_a, type_b, n_mats, n_used, b, m, n, k);
+        return VARS_TO_STR9(type_a, type_b, n_mats, n_used, b, m, n, k, o);
     }

     double max_nmse_err() override {
@@ -3217,9 +3231,9 @@ struct test_mul_mat_id : public test_case {

     test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
             int n_mats = 8, int n_used = 2, bool b = false,
-            int64_t m = 32, int64_t n = 32, int64_t k = 32)
+            int64_t m = 32, int64_t n = 32, int64_t k = 32, uint32_t o = 1)
         : type_a(type_a), type_b(type_b), n_mats(n_mats), n_used(n_used), b(b),
-            m(m), n(n), k(k) {
+            m(m), n(n), k(k), o(o) {
             GGML_ASSERT(n_used <= n_mats);
         }

@@ -3241,6 +3255,13 @@ struct test_mul_mat_id : public test_case {
         ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids);
         ggml_set_name(out, "out");

+        for (uint32_t i = 1; i < o; ++i) {
+            ggml_tensor * a2 = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
+            ggml_tensor * out2 = ggml_mul_mat_id(ctx, a2, b, ids);
+            ggml_set_name(out2, "out2");
+            out = ggml_add(ctx, out, out2);
+        }
+
         return out;
     }

@@ -3264,6 +3285,13 @@ struct test_mul_mat_id : public test_case {
             }
         }
     }
+
+    bool run_whole_graph() override { return o > 1; }
+
+    std::string op_desc(ggml_tensor * t) override {
+        GGML_UNUSED(t);
+        return ggml_op_name(GGML_OP_MUL_MAT_ID);
+    }
 };

 // GGML_OP_OUT_PROD
@@ -5798,6 +5826,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45,  64, { 8,  1}, {4, 1}));
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1,  1}, {4, 1}, {0, 2, 1, 3}));
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67,  {1,  1}, {4, 1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 16, 32, 32, { 1,  1}, {1, 1}, {0, 1, 2, 3}, true, 3));

     for (auto bs2 : {1,3}) {
         for (auto bs : {1,2,4,8}) {
@@ -5826,6 +5855,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     }

     test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 1, 1, false, 8, 16, 1));
+    test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 16, 16, false, 32, 32, 32, 3));

     for (ggml_type type_a : base_types) {
         for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
Author: Jeff Bolz