	CANN: Optimize MUL_MAT_ID (#15658)
@@ -2867,174 +2867,49 @@ void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){
  */
 static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     //dst   [M, K, N, 1]
-    ggml_tensor * src0 = dst->src[0];  //src0	[D, M, A, 1]
-    ggml_tensor * src1 = dst->src[1];  //src1	[D, B, N, 1], B = K or B = 1
+    ggml_tensor * src0 = dst->src[0];  //src0	[D, M, A, 1]  -> [D, M, K, 1]
+    ggml_tensor * src1 = dst->src[1];  //src1	[D, B, N, 1], B = K or B = 1 -> [D, 1, K, 1]
     ggml_tensor * ids  = dst->src[2];  //ids	[K, N]
 
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    // copy index from npu to cpu
-    int64_t n_as = ne02; // A
-    int64_t n_ids = ids->ne[0]; // K
-
-    std::vector<char> ids_host(ggml_nbytes(ids));
-    ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
-        ACL_MEMCPY_DEVICE_TO_HOST);
-    ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
-
-    char * src0_original = (char *) src0->data;
-    char * src1_original = (char *) src1->data;
-    char * dst_original  = (char *)  dst->data;
-    size_t ori_src0_nb[4] = {nb00, nb01, nb02, nb03};
-
-    // src0 is F16, src1 is F32, dst is F32
-    ggml_cann_pool_alloc src0_cast_allocator;
-    if (src0->type == GGML_TYPE_F16) {
-        src0_cast_allocator.alloc(ctx.pool(), sizeof(float) * ggml_nelements(src0));
-        void* src0_cast_buf = src0_cast_allocator.get();
-
-        size_t cast_nb[GGML_MAX_DIMS];
-        cast_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            cast_nb[i] = cast_nb[i - 1] * src0->ne[i - 1];
+    GGML_ASSERT(src0->ne[3] == 1);
+    GGML_ASSERT(src1->ne[3] == 1);
+    GGML_ASSERT(dst->ne[3] == 1);
+
+    int64_t batch = src1->ne[2];
+    GGML_ASSERT(batch == ids->ne[1]);
+
+    ggml_cann_pool_alloc export_allocator(ctx.pool(), src0->ne[0] * src0->ne[1] * ids->ne[0] * ggml_element_size(src0));
+    void* export_ptr = export_allocator.get();
+    for (int64_t i = 0; i < batch; i++) {
+        aclTensor *select_index = ggml_cann_create_tensor(ids, ids->ne, ids->nb, 1, ACL_FORMAT_ND, i * ids->nb[1]);
+        aclTensor *export_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3);
+
+        int64_t select_export_ne[] = {src0->ne[0], src0->ne[1], ids->ne[0]};
+        size_t select_export_nb[3];
+        select_export_nb[0] = src0->nb[0];
+        for (int k = 1;k < 3; k++) {
+            select_export_nb[k] = select_export_nb[k-1] * select_export_ne[k-1];
         }
 
-        aclTensor* acl_src0_f16 = ggml_cann_create_tensor(src0);
-        aclTensor* acl_cast = ggml_cann_create_tensor(src0_cast_buf,
-            ACL_FLOAT, sizeof(float), src0->ne, cast_nb, 4);
-        GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_src0_f16, ACL_FLOAT, acl_cast);
-        ggml_cann_release_resources(ctx, acl_cast, acl_src0_f16);
-
-        src0_original = (char *) src0_cast_buf;
-        memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb));
+        aclTensor *select_export = ggml_cann_create_tensor(export_ptr, ggml_cann_type_mapping(src0->type), ggml_element_size(src0), select_export_ne, select_export_nb, 3);
+        GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, export_weight, 0, select_index, select_export);
+
+        int64_t select_transpose_ne[] = {select_export_ne[1], select_export_ne[0], select_export_ne[2]};
+        size_t select_transpose_nb[] = {select_export_nb[1], select_export_nb[0], select_export_nb[2]};
+        aclTensor *select_export_transpose = ggml_cann_create_tensor(export_ptr, ggml_cann_type_mapping(src0->type), ggml_element_size(src0), select_transpose_ne, select_transpose_nb, 3);
+
+        int64_t active_tensor_ne[] = {src1->ne[0], 1, src1->ne[1]};
+        size_t active_tensor_nb[] = {src1->nb[0], src1->nb[1], src1->nb[1]};
+        aclTensor *active_tensor = ggml_cann_create_tensor(src1, active_tensor_ne, active_tensor_nb, 3, ACL_FORMAT_ND, i * src1->nb[2]);
+
+        int64_t dst_ne[] = {dst->ne[0], 1, dst->ne[1]};
+        size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[1]};
+        aclTensor *acl_dst = ggml_cann_create_tensor(dst, dst_ne,dst_nb, 3, ACL_FORMAT_ND, i * dst->nb[2]);
+
+        GGML_CANN_CALL_ACLNN_OP(ctx, BatchMatMul, active_tensor, select_export_transpose, acl_dst, 2);
+
+        ggml_cann_release_resources(ctx, select_index, export_weight, select_export, active_tensor, acl_dst, select_export_transpose);
     }
-
-#ifdef ASCEND_310P
-    ggml_tensor src0_row = *src0;
-    ggml_tensor src1_row = *src1;
-    ggml_tensor dst_row = *dst;
-
-    if (src0->type == GGML_TYPE_F16) {
-        src0_row.type = GGML_TYPE_F32;
-    }
-
-    // src0_row [D, M, 1, 1] weight without permute
-    src0_row.ne[2] = 1;
-    src0_row.ne[3] = 1;
-    src0_row.nb[0] = ori_src0_nb[0];
-    src0_row.nb[1] = ori_src0_nb[1];
-    src0_row.nb[2] = ori_src0_nb[1];
-    src0_row.nb[3] = ori_src0_nb[1];
-
-    // src1_row [D, 1, 1, 1] -> input
-    src1_row.ne[1] = 1;
-    src1_row.ne[2] = 1;
-    src1_row.ne[3] = 1;
-    src1_row.nb[2] = nb11;
-    src1_row.nb[3] = nb11;
-
-    // dst_row [M, 1, 1, 1] -> out
-    dst_row.ne[1] = 1;
-    dst_row.ne[2] = 1;
-    dst_row.ne[3] = 1;
-    dst_row.nb[2] = nb1;
-    dst_row.nb[3] = nb1;
-
-    //create weight for one row
-    for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
-        for (int64_t id = 0; id < n_ids; id++) {
-            // expert index
-            int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
-            GGML_ASSERT(i02 >= 0 && i02 < n_as);
-
-            // If B = 1 (broadcast), always use 0; otherwise, use id.
-            int64_t i11 = (ne11 == 1 ? 0 : id);
-            int64_t i12 = iid1;
-
-            int64_t i1 = id;
-            int64_t i2 = i12;
-
-            void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
-            void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
-            void* dst_tmp_ptr  = dst_original  + i1*nb1   + i2*nb2;
-
-            src0_row.data = src0_tmp_ptr;
-            src1_row.data = src1_tmp_ptr;
-            dst_row.data = dst_tmp_ptr;
-            dst_row.src[0] = &src0_row;
-            dst_row.src[1] = &src1_row;
-
-            ggml_cann_mul_mat(ctx, &dst_row);
-        }
-    }
-    return;
-#endif
-
-    std::vector<aclTensor*> src0_tensor_vec;
-    std::vector<aclTensor*> src1_tensor_vec;
-    std::vector<aclTensor*> dst_tensor_vec;
-    for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
-        for (int64_t id = 0; id < n_ids; id++) {
-            // src0_row [M, D] -> weight && permute
-            int64_t src0_ne[2] = {ne01, ne00};
-            size_t src0_nb[2] = {ori_src0_nb[1], ori_src0_nb[0]};
-            // src1_row [D, 1] -> input
-            int64_t src1_ne[2] = {ne10, 1};
-            size_t src1_nb[2] = {nb10, nb11};
-            // dst_row [M, 1] -> out
-            int64_t dst_ne[2] = {ne0, 1};
-            size_t dst_nb[2] = {nb0, nb1};
-
-            // expert index
-            int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
-            GGML_ASSERT(i02 >= 0 && i02 < n_as);
-
-            // If B = 1 (broadcast), always use 0; otherwise, use id.
-            int64_t i11 = (ne11 == 1 ? 0 : id);
-            int64_t i12 = iid1;
-
-            int64_t i1 = id;
-            int64_t i2 = i12;
-
-            void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
-            void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
-            void* dst_tmp_ptr  = dst_original  + i1*nb1   + i2*nb2;
-
-            aclTensor* acl_src0 = ggml_cann_create_tensor(src0_tmp_ptr,
-                ACL_FLOAT, sizeof(float),
-                src0_ne, src0_nb, 2);
-            aclTensor* acl_src1 = ggml_cann_create_tensor(src1_tmp_ptr,
-                ACL_FLOAT, sizeof(float),
-                src1_ne, src1_nb, 2);
-            aclTensor* acl_dst = ggml_cann_create_tensor(dst_tmp_ptr,
-                ACL_FLOAT, sizeof(float),
-                dst_ne, dst_nb, 2);
-
-            src0_tensor_vec.push_back(acl_src0);
-            src1_tensor_vec.push_back(acl_src1);
-            dst_tensor_vec.push_back(acl_dst);
-        }
-    }
-
-    size_t GROUP_SIZE = 128;
-    // GroupedMatmulV3 required tensor_list.size < 128
-    for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
-        // split and call GroupedMatmulV3
-        size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
-        std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
-        std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
-        std::vector<aclTensor*> dst_tensor_vec_split(dst_tensor_vec.begin() + i, dst_tensor_vec.begin() + end);
-
-        aclTensorList* src0_tensor_list = aclCreateTensorList(src0_tensor_vec_split.data(), src0_tensor_vec_split.size());
-        aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size());
-        aclTensorList* dst_tensor_list = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size());
-
-        GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV3, src1_tensor_list, src0_tensor_list,
-            nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list);
-
-        ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list);
-    }
-    return;
 }
 
 /**
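The rewritten kernel drops the host-side copy of the ids tensor and the per-(token, expert) GroupedMatmulV3 / ggml_cann_mul_mat dispatch: for each row of ids it gathers the selected expert weight matrices on the device with IndexSelect into a scratch buffer and multiplies them against the activations with a single BatchMatMul. For orientation, here is a minimal CPU-side sketch of the MUL_MAT_ID semantics the kernel implements, assuming the broadcast case (one activation vector shared by the experts selected for a token); the helper name mul_mat_id_ref, the row-major layouts, and all parameter names are illustrative assumptions, not ggml or CANN API.

#include <cstdint>
#include <vector>

// Hypothetical CPU reference for MUL_MAT_ID (not part of ggml/CANN).
// weights: n_expert matrices of size M x D, expert-major, row-major.
// acts:    n_token activation vectors of length D (B = 1 broadcast case).
// ids:     n_token x n_used expert indices; ids[n][k] picks the expert for slot k.
// out:     n_token x n_used x M results, one M-vector per (token, selected expert).
static void mul_mat_id_ref(const std::vector<float> & weights, int64_t n_expert, int64_t M, int64_t D,
                           const std::vector<float> & acts, int64_t n_token,
                           const std::vector<int32_t> & ids, int64_t n_used,
                           std::vector<float> & out) {
    out.assign(n_token * n_used * M, 0.0f);
    for (int64_t n = 0; n < n_token; n++) {
        const float * x = acts.data() + n * D;                 // activation of token n
        for (int64_t k = 0; k < n_used; k++) {
            const int32_t e = ids[n * n_used + k];             // expert chosen for slot k
            const float * W = weights.data() + e * M * D;      // that expert's M x D matrix
            float * y = out.data() + (n * n_used + k) * M;
            for (int64_t m = 0; m < M; m++) {                  // y = W * x
                float acc = 0.0f;
                for (int64_t d = 0; d < D; d++) {
                    acc += W[m * D + d] * x[d];
                }
                y[m] = acc;
            }
        }
    }
}

In this picture, the new code realizes the two inner loops for one token as a single IndexSelect (gathering the n_used selected W matrices into a contiguous buffer) followed by one BatchMatMul over that buffer, instead of issuing one small matmul per (token, expert) pair.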