Mirror of https://github.com/ggml-org/llama.cpp.git
	change the reorder tensor from init to execute OP (#13003)
@@ -313,7 +313,6 @@ struct ggml_backend_sycl_context {
     int device;
     std::string name;
     optimize_feature opt_feature;
-    bool optimized_graph=false;
 
     queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
 
@@ -192,7 +192,7 @@ static void ggml_check_sycl() try {
 
     if (!initialized) {
         g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
-        g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1);
+        g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
         g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
         GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
         GGML_LOG_INFO("Running with Environment Variables:\n");
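The only behavioral change in the hunk above is the default for GGML_SYCL_DISABLE_OPT: the reorder optimization is now on unless the user explicitly disables it. As a rough illustration of what that default flip means (get_env_int below is a hypothetical stand-in, not the backend's real get_sycl_env helper):

#include <cstdio>
#include <cstdlib>

// Hypothetical stand-in for the backend's env helper: read an integer
// environment variable, falling back to `def` when it is unset.
static int get_env_int(const char * name, int def) {
    const char * val = std::getenv(name);
    return val ? std::atoi(val) : def;
}

int main() {
    // Default changed from 1 to 0 in this commit: with the variable unset,
    // the reorder optimization is now enabled; export GGML_SYCL_DISABLE_OPT=1
    // to restore the previous behavior.
    const int disable_optimize = get_env_int("GGML_SYCL_DISABLE_OPT", /*def =*/ 0);
    std::printf("reorder optimization: %s\n", disable_optimize ? "disabled" : "enabled");
    return 0;
}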
@@ -2852,6 +2852,64 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     }
 }
 
+static void reorder_qw(char *data_device, const int ncols, const int nrows,
+                size_t size, size_t offset, dpct::queue_ptr stream) {
+    auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
+    SYCL_CHECK(
+        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
+            .wait()));
+    GGML_ASSERT((size % sizeof(block_q4_0) == 0));
+    GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
+    int offset_blks = offset / sizeof(block_q4_0);
+    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;
+    auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
+
+    stream->parallel_for(
+        size / sizeof(block_q4_0),
+            [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+            const block_q4_0* x = (const block_q4_0*)tmp_buf;
+            const int ib = i;
+
+            for (int j = 0; j < QK4_0/2; j ++)
+            {
+                *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
+            }
+            *(d_ptr + ib) = x[ib].d;
+        });
+
+    sycl::free(tmp_buf, *stream);
+}
+
+static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
+    char*data_device = (char*)src0->data;
+    size_t ncols = src0->ne[0];
+    size_t nrows = src0->ne[1];
+    size_t size = ggml_nbytes(src0);
+
+    reorder_qw(data_device, ncols, nrows, size, 0, stream);
+}
+
+/*
+* This function can be called when the OP (mul_mat) implementation supports the reorder optimization.
+*/
+static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1,
+    ggml_tensor * dst) {
+    if (!g_ggml_sycl_disable_optimize && //optimization allowed, controlled by $GGML_SYCL_DISABLE_OPT
+        ctx->opt_feature.reorder &&      //allow only devices where it performs well, skip the rest
+        dst->op == GGML_OP_MUL_MAT &&    //limited to some supported Q4_0 cases, more cases to do
+        src0->type == GGML_TYPE_Q4_0 &&
+        src1->ne[2]==1 && src1->ne[3]==1) {
+
+        ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra;
+        if (!extra) return; //only happens in the CI/UT permute case.
+
+        if (extra->optimized_feature.reorder) return; //skip tensors that are already reordered.
+
+        reorder_qw(src0, ctx->stream());
+        extra->optimized_feature.reorder = true; //used by the decode/dequantize steps that follow.
+    }
+}
+
 static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
     const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
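For orientation: reorder_qw above rewrites a Q4_0 tensor in place, turning the array of block_q4_0 structs (scale and quantized nibbles interleaved per block) into a split layout with all qs bytes first and all per-block scales after them. Below is a host-side sketch of the same permutation, assuming QK4_0 == 32 and using uint16_t as a stand-in for the half-precision scale (BlockQ4_0 and reorder_q4_0_host are illustrative names, not backend code):

#include <cstdint>
#include <cstring>
#include <vector>

constexpr int QK4_0 = 32;            // values per block, as in ggml's Q4_0 format (assumed)

struct BlockQ4_0 {                   // simplified stand-in for ggml's block_q4_0
    uint16_t d;                      // per-block scale (raw half bits)
    uint8_t  qs[QK4_0 / 2];          // 32 4-bit values packed into 16 bytes
};

// Same permutation as the SYCL kernel, performed on the host:
// out = [qs of block 0 | qs of block 1 | ...][d of block 0 | d of block 1 | ...]
static void reorder_q4_0_host(const std::vector<BlockQ4_0> & blocks, std::vector<uint8_t> & out) {
    const size_t nblocks = blocks.size();
    out.resize(nblocks * sizeof(BlockQ4_0));
    uint8_t * qs_ptr = out.data();
    uint8_t * d_ptr  = qs_ptr + nblocks * QK4_0 / 2;   // scales start right after all qs bytes
    for (size_t ib = 0; ib < nblocks; ++ib) {
        std::memcpy(qs_ptr + ib * QK4_0 / 2, blocks[ib].qs, QK4_0 / 2);
        std::memcpy(d_ptr + ib * sizeof(uint16_t), &blocks[ib].d, sizeof(uint16_t));
    }
}

int main() {
    std::vector<BlockQ4_0> blocks(4);      // 4 blocks = 128 quantized values
    std::vector<uint8_t> reordered;
    reorder_q4_0_host(blocks, reordered);  // reordered.size() == 4 * sizeof(BlockQ4_0)
    return 0;
}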
@@ -2914,6 +2972,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
         // KQ + KQV multi-batch
         ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
+        opt_for_reorder(&ctx, src0, src1, dst); //the OP function in this branch supports reorder.
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
         // save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream());
     } else if (use_mul_mat_vec_q) {
@@ -2921,6 +2980,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
     } else if (use_mul_mat_q) {
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
     } else {
+        opt_for_reorder(&ctx, src0, src1, dst); //the OP function in this branch supports reorder.
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
     }
 }
@@ -3545,71 +3605,8 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void reorder_qw(char *data_device, const int ncols, const int nrows,
-                size_t size, size_t offset, dpct::queue_ptr stream) {
-    auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
-    SYCL_CHECK(
-        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
-            .wait()));
-    GGML_ASSERT((size % sizeof(block_q4_0) == 0));
-    GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
-    int offset_blks = offset / sizeof(block_q4_0);
-    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;;
-    auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
-
-    stream->parallel_for(
-        size / sizeof(block_q4_0),
-            [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-            const block_q4_0* x = (const block_q4_0*)tmp_buf;
-            const int ib = i;
-
-            for (int j = 0; j < QK4_0/2; j ++)
-            {
-                *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
-            }
-            *(d_ptr + ib) = x[ib].d;
-        });
-
-    sycl::free(tmp_buf, *stream);
-}
-
-static void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
-    char*data_device = (char*)src0->data;
-    size_t ncols = src0->ne[0];
-    size_t nrows = src0->ne[1];
-    size_t size = ggml_nbytes(src0);
-
-    reorder_qw(data_device, ncols, nrows, size, 0, stream);
-}
-
-static void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
-    ggml_tensor *src0 = dst->src[0];
-    ggml_tensor *src1 = dst->src[1];
-
-    if (dst->op == GGML_OP_MUL_MAT && src0->type == GGML_TYPE_Q4_0 &&
-        src1->ne[2]==1 && src1->ne[3]==1) {
-        reorder_qw(src0, stream);
-        ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra;
-        GGML_ASSERT(extra);
-        extra->optimized_feature.reorder = true; //used to decode/dequan in next steps.
-    }
-}
-
-static void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
-    dpct::queue_ptr stream = ctx->stream();
-    if (ctx->optimized_graph) {
-       return;
-    }
-    ctx->optimized_graph = true;
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream);
-    }
-}
-
 static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) {
     ggml_sycl_set_main_device(sycl_ctx->device);
-    if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx);
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
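Net effect of the last two hunks: the one-shot, whole-graph reorder pass (optimize_graph_once, driven by the now-removed ctx->optimized_graph flag) is gone, and the reorder instead happens lazily inside the mul_mat op, guarded by the per-tensor extra->optimized_feature.reorder flag. Below is a deliberately simplified sketch of this control-flow change; the types and function names are stand-ins, not backend code:

#include <vector>

struct Node {                          // stand-in for a graph node holding a Q4_0 weight
    bool is_q4_0_mul_mat = false;
    bool reordered       = false;      // mirrors extra->optimized_feature.reorder
};

static void reorder_once(Node & n) {
    if (!n.reordered) {
        // rewrite the weight layout in place (cf. reorder_qw)
        n.reordered = true;
    }
}

// Before this commit: an eager pass over the whole graph, run once per context
// before execution (cf. the removed optimize_graph_once()).
static void compute_graph_old(std::vector<Node> & nodes) {
    for (Node & n : nodes) {
        if (n.is_q4_0_mul_mat) reorder_once(n);
    }
    for (Node & n : nodes) {
        (void) n;                      // execute the node
    }
}

// After this commit: the reorder happens lazily inside the op that uses it
// (cf. opt_for_reorder() called from ggml_sycl_mul_mat()).
static void compute_graph_new(std::vector<Node> & nodes) {
    for (Node & n : nodes) {
        if (n.is_q4_0_mul_mat) reorder_once(n);
        // execute the node
    }
}

int main() {
    std::vector<Node> graph(3);
    compute_graph_old(graph);
    compute_graph_new(graph);
    return 0;
}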
Author: Neo Zhang Jianyu