CANN: Support Ascend310P to accelerate F32 and F16 Model (#10216)

* CANN Support Ascend310P to accelerate F32 and F16 Model

* Add compile option soc type macro ASCEND_310P to ggml-cann lib

* Remove unused code

* Remove the ascend soc_type hard code compile option in CMakelist.txt
This commit is contained in:
leo-pony
2024-11-22 14:07:20 +08:00
committed by GitHub
parent a5e47592b6
commit c18610b4ee
7 changed files with 123 additions and 41 deletions

View File

@@ -13,7 +13,7 @@ class GET_ROW_F32 {
int64_t *indices_ne_ub, size_t *indices_nb_ub,
int64_t *output_ne_ub, size_t *output_nb_ub) {
int64_t op_block_num = GetBlockNum();
int64_t op_block_idx = GetBlockIdx();
op_block_idx = GetBlockIdx();
for (int i = 0; i < 4; i++) {
input_ne[i] = input_ne_ub[i];
@@ -55,31 +55,40 @@ class GET_ROW_F32 {
__aicore__ inline void copy_in(uint32_t offset, size_t len) {
LocalTensor<float> input_local = input_queue.AllocTensor<float>();
size_t tail = len % 32;
len = len & ~31;
DataCopy(input_local, input_gm[offset], len);
const size_t elem_per_block = 32 / sizeof(float);
size_t tail = len % elem_per_block;
len = len & ~(elem_per_block - 1);
if(tail != 0) {
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(float);
DataCopyPadExtParams<float> padParams;
DataCopyPad(input_local[len], input_gm[offset + len],
dataCopyParams, padParams);
len += elem_per_block;
}
DataCopy(input_local, input_gm[offset], len);
input_queue.EnQue(input_local);
}
__aicore__ inline void copy_out(uint32_t offset, size_t len) {
LocalTensor<float> output_local = output_queue.DeQue<float>();
size_t tail = len % 32;
len = len & ~31;
DataCopy(output_gm[offset], output_local, len);
const size_t elem_per_block = 32 / sizeof(float);
size_t tail = len % elem_per_block;
len = len & ~(elem_per_block - 1);
if (len > 0) {
DataCopy(output_gm[offset], output_local, len);
}
if(tail != 0) {
#ifdef ASCEND_310P
for (size_t i = tail; i < elem_per_block; i++) {
output_local[len + i].SetValue(0, 0);
}
SetAtomicAdd<float>();
DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
SetAtomicNone();
#else
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(float);
DataCopyPad(output_gm[offset + len], output_local[len],
dataCopyParams);
#endif
}
output_queue.FreeTensor(output_local);
}
@@ -144,6 +153,7 @@ class GET_ROW_F32 {
GlobalTensor<float> output_gm;
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
int64_t op_block_idx;
};
template <typename T>