Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)
	CANN: weight format to NZ for Ascend310P3 (#14407)
* weight format to nz for 310p
* remove quant weight format to nz
* clean code
* fix
* make the conditions for converting weights to NZ format consistent
* clean code
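In short: on Ascend 310P devices the backend can now store matmul weights in the blocked FRACTAL_NZ layout used by Ascend's matmul kernels, instead of plain ND. The feature is doubly gated: at compile time by the ASCEND_310P define, and at runtime by the presence of the GGML_CANN_WEIGHT_NZ environment variable (only presence is checked, so GGML_CANN_WEIGHT_NZ=1 or any other value enables it). A minimal sketch of that gate as it appears in the hunks below; the helper name use_weight_nz is mine, not part of the patch:

// Sketch only: how the NZ conversion is gated, per the diff below.
// `use_weight_nz` is a hypothetical helper, not a function in the patch.
#include <cstdlib>

static bool use_weight_nz() {
    bool enabled = false;
#ifdef ASCEND_310P  // compile-time gate: only in 310P builds
    // runtime opt-in: any value works, only presence is checked
    enabled = (std::getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
#endif
    return enabled;
}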
@@ -1785,8 +1785,27 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
     size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
                              bcast_weight_nb[2], bcast_weight_nb[3],
                              bcast_weight_nb[4], bcast_weight_nb[5]};
-    aclTensor* acl_weight_tensor =
-        ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
+    aclTensor* acl_weight_tensor;
+
+    bool weightToNZ = false;
+#ifdef ASCEND_310P
+    weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
+#endif
+    if (weightToNZ && is_matmul_weight(weight)) {
+        int64_t acl_stride[2] = {1, transpose_ne[1]};
+
+        // Reverse ne.
+        std::reverse(transpose_ne, transpose_ne + n_dims);
+
+        std::vector<int64_t> storageDims = {transpose_ne[0], transpose_ne[1]};
+
+        acl_weight_tensor = aclCreateTensor(
+            transpose_ne, n_dims, ggml_cann_type_mapping(weight->type), acl_stride,
+            0, ACL_FORMAT_FRACTAL_NZ, storageDims.data(), 2, weight->data);
+    } else {
+        acl_weight_tensor =
+            ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
+    }
     aclTensor* acl_dst =
         ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
 
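Why the std::reverse in the NZ branch: ggml orders ne[] with the fastest-varying dimension first, while ACL tensor descriptors take the slowest-varying dimension first, so a direct aclCreateTensor call has to flip the order itself (the ggml_cann_create_tensor helper used on the ND path handles this internally). A minimal illustration of that flip, with a hypothetical helper name:

// Illustration only: converting ggml's ne[] (fastest dim first) into
// ACL dimension order (slowest dim first). Mirrors the std::reverse above.
#include <algorithm>
#include <cstdint>
#include <vector>

static std::vector<int64_t> ggml_ne_to_acl(const int64_t* ne, int n_dims) {
    std::vector<int64_t> acl_ne(ne, ne + n_dims);
    std::reverse(acl_ne.begin(), acl_ne.end());
    return acl_ne;
}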
@@ -23,6 +23,7 @@
 #ifndef CANN_ACLNN_OPS
 #define CANN_ACLNN_OPS
 
+#include <unordered_set>
 #include <functional>
 #include <aclnnop/aclnn_abs.h>
 #include <aclnnop/aclnn_neg.h>
@@ -1020,6 +1021,37 @@ inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffe
  */
 void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 
+/**
+ * @brief   Check whether a tensor is a weight tensor for matrix multiplication.
+ *
+ * @details Checks whether the given tensor serves as weight parameters in matrix multiplication operations,
+ *          typically within neural network layers. The function maintains a static set of canonical weight
+ *          naming suffixes from Transformer-based architectures. Uses substring matching to identify weight
+ *          tensors even with hierarchical naming patterns.
+ *
+ * @param tensor Pointer to the target ggml_tensor object (const-qualified).
+ */
+static bool is_matmul_weight(const ggml_tensor* tensor) {
+    std::string name = ggml_get_name(tensor);
+    static const std::unordered_set<std::string> weight_suffixes{
+        "output.weight",
+        "attn_q.weight",
+        "attn_k.weight",
+        "attn_v.weight",
+        "attn_output.weight",
+        "ffn_gate.weight",
+        "ffn_up.weight",
+        "ffn_down.weight"
+    };
+
+    for (const auto& suffix : weight_suffixes) {
+        if (name.find(suffix) != std::string::npos) {
+            return true;
+        }
+    }
+    return false;
+}
+
 /**
  * @brief Applies a element-wise operation to two input tensors using the CANN
  * backend.
 
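The is_matmul_weight() function added above identifies weights purely by name suffix, so it also matches per-layer tensors with hierarchical names. A standalone demo of the same substring test; names like "blk.0.attn_q.weight" follow llama.cpp's GGUF naming convention, and matches_weight_suffix is a hypothetical stand-in for the patched function:

// Self-contained illustration (not part of the patch) of the substring
// match used by is_matmul_weight().
#include <iostream>
#include <string>
#include <unordered_set>

static bool matches_weight_suffix(const std::string& name) {
    static const std::unordered_set<std::string> weight_suffixes{
        "output.weight", "attn_q.weight", "attn_k.weight", "attn_v.weight",
        "attn_output.weight", "ffn_gate.weight", "ffn_up.weight", "ffn_down.weight"
    };
    for (const auto& suffix : weight_suffixes) {
        if (name.find(suffix) != std::string::npos) {
            return true;  // suffix found anywhere in the hierarchical name
        }
    }
    return false;
}

int main() {
    std::cout << matches_weight_suffix("blk.0.attn_q.weight") << '\n';  // 1
    std::cout << matches_weight_suffix("token_embd.weight")   << '\n';  // 0
}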
@@ -24,6 +24,7 @@
 
 #include <acl/acl.h>
 #include <stdarg.h>
+#include <aclnnop/aclnn_trans_matmul_weight.h>
 
 #include <cmath>
 #include <cstdio>
@@ -1115,6 +1116,63 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
     return GGML_STATUS_SUCCESS;
 }
 
+static int CreateAclTensorWeight(const void *hostData, const std::vector<int64_t> &shape, void **deviceAddr,
+                      aclDataType dataType, aclTensor **tensor)
+{
+    uint64_t size = 1;
+    for (auto i : shape) {
+        size *= i;
+    }
+
+    const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size());
+    ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size));
+
+    size *= sizeof(int16_t);
+
+    ACL_CHECK(aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST));
+    aclrtMemcpy(*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE);
+
+    std::vector<int64_t> strides(shape.size(), 1);
+    for (int64_t i = shape.size() - 2; i >= 0; i--) {
+        strides[i] = shape[i + 1] * strides[i + 1];
+    }
+
+    *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
+                              shape.data(), shape.size(), *deviceAddr);
+    return 0;
+}
+
+static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
+    aclrtStream stream;
+    ACL_CHECK(aclrtCreateStream(&stream));
+
+    std::vector<int64_t> weightTransposedShape = {tensor->ne[1], tensor->ne[0]};
+    void *weightTransposedDeviceAddr = nullptr;
+    aclTensor *weightTransposed = nullptr;
+    CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr,
+                          ggml_cann_type_mapping(tensor->type), &weightTransposed);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor *executor;
+    void *workspaceAddr = nullptr;
+
+    // TransMatmulWeight
+    ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor));
+    std::unique_ptr<void, aclError (*)(void *)> workspaceAddrPtrTrans(nullptr, aclrtFree);
+    if (workspaceSize > 0) {
+        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
+        workspaceAddrPtrTrans.reset(workspaceAddr);
+    }
+    ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream));
+
+    size_t size = ggml_nelements(tensor) * ggml_element_size(tensor);
+
+    aclrtMemcpy((char *)tensor->data + offset, size,
+                weightTransposedDeviceAddr, size, ACL_MEMCPY_HOST_TO_DEVICE);
+    ACL_CHECK(aclDestroyTensor(weightTransposed));
+    aclrtFree(weightTransposedDeviceAddr);
+}
+
 // TODO: need handle tensor which has paddings.
 /**
  * @brief Set tensor data in a CANN buffer.
@@ -1139,9 +1197,16 @@ static void ggml_backend_cann_buffer_set_tensor(
     // For acl, synchronous functions use this default stream.
     // Why aclrtSynchronizeDevice?
 
+    bool weightToNZ = false;
+#ifdef ASCEND_310P
+    weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
+#endif
     if (!need_transform(tensor->type)) {
         ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
                               ACL_MEMCPY_HOST_TO_DEVICE));
+        if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) {
+            weight_format_to_nz(tensor, data, offset);
+        }
     } else {
         void *transform_buffer = malloc(size);
         ggml_backend_cann_transform(tensor, data, transform_buffer);
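Two details of weight_format_to_nz() are worth noting. First, it follows the usual two-phase aclnn calling convention: query the workspace size, allocate, then launch aclnnTransMatmulWeight on a stream. Second, it parks the workspace allocation in a std::unique_ptr with aclrtFree as a custom deleter, so the buffer is released on every exit path. The same RAII idiom in self-contained form, with std::malloc/std::free standing in for the ACL allocator:

// Self-contained sketch of the workspace guard idiom used above,
// with the standard allocator substituted for aclrtMalloc/aclrtFree.
#include <cstdlib>
#include <memory>

int main() {
    // deleter runs automatically when the guard goes out of scope
    std::unique_ptr<void, void (*)(void*)> guard(nullptr, std::free);
    size_t workspace_size = 1024;  // stand-in for the queried workspace size
    if (workspace_size > 0) {
        void* p = std::malloc(workspace_size);
        guard.reset(p);            // guard now owns the workspace buffer
    }
    // ... pass guard.get() as the workspace pointer to the op launch ...
    return 0;                      // buffer freed here, on any return path
}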
Author: chen fan