mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	CANN: Fix SOC_TYPE compile bug (#10519)
* CANN: Fix the bug build fail on Ascend310P under two cases: 1) Manual specify SOC_TYPE 2) Under some unusual compile environment * Update the cann backend News content: Support F16 and F32 data type model for Ascend 310P NPU. * fix CANN compile fail bug: the assert in ascend kernel function doesn't supportted on some CANN version
This commit is contained in:
		| @@ -23,6 +23,8 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi | |||||||
|  |  | ||||||
| ## News | ## News | ||||||
|  |  | ||||||
|  | - 2024.11 | ||||||
|  |   - Support F16 and F32 data type model for Ascend 310P NPU. | ||||||
| - 2024.8 | - 2024.8 | ||||||
|   - Support `Q4_0` and `Q8_0` data type for Ascend NPU. |   - Support `Q4_0` and `Q8_0` data type for Ascend NPU. | ||||||
| - 2024.7 | - 2024.7 | ||||||
| @@ -43,6 +45,7 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi | |||||||
| | Ascend NPU                    | Status  | | | Ascend NPU                    | Status  | | ||||||
| |:-----------------------------:|:-------:| | |:-----------------------------:|:-------:| | ||||||
| | Atlas 300T A2                 | Support | | | Atlas 300T A2                 | Support | | ||||||
|  | | Atlas 300I Duo                | Support | | ||||||
|  |  | ||||||
| *Notes:* | *Notes:* | ||||||
|  |  | ||||||
|   | |||||||
| @@ -22,13 +22,14 @@ if(NOT SOC_TYPE) | |||||||
|     detect_ascend_soc_type(SOC_VERSION) |     detect_ascend_soc_type(SOC_VERSION) | ||||||
|     set(SOC_TYPE "${SOC_VERSION}") |     set(SOC_TYPE "${SOC_VERSION}") | ||||||
|     message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}") |     message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}") | ||||||
| else() |  | ||||||
|     string(TOLOWER ${SOC_TYPE} SOC_VERSION) |  | ||||||
| endif() | endif() | ||||||
|  |  | ||||||
| # Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND310P. | string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION need lower | ||||||
|  |  | ||||||
|  | # Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND_310P. | ||||||
| string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}") | string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}") | ||||||
| set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}") | set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}") | ||||||
|  | string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION) | ||||||
|  |  | ||||||
| if (CANN_INSTALL_DIR) | if (CANN_INSTALL_DIR) | ||||||
|     # Only Support Linux. |     # Only Support Linux. | ||||||
|   | |||||||
| @@ -25,6 +25,6 @@ ascendc_library(ascendc_kernels STATIC | |||||||
|     ${SRC_FILES} |     ${SRC_FILES} | ||||||
| ) | ) | ||||||
|  |  | ||||||
| message(STATUS "CANN: compile ascend kernels witch SOC_VERSION:${SOC_VERSION}.") | message(STATUS "CANN: compile ascend kernels witch SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.") | ||||||
| ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}") | ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}") | ||||||
| # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP) | # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP) | ||||||
|   | |||||||
| @@ -20,7 +20,6 @@ class DupByRows { | |||||||
|         // Input has four dims. |         // Input has four dims. | ||||||
|         int64_t op_block_num = GetBlockNum(); |         int64_t op_block_num = GetBlockNum(); | ||||||
|         int64_t op_block_idx = GetBlockIdx(); |         int64_t op_block_idx = GetBlockIdx(); | ||||||
|         assert(op_block_idx < SUPPORTED_MAX_DIM && op_block_idx >= 0, "Invalid block index:%d, max is:%d\n", op_block_idx, SUPPORTED_MAX_DIM); |  | ||||||
|  |  | ||||||
|         // param |         // param | ||||||
|         num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3]; |         num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3]; | ||||||
|   | |||||||
| @@ -2,6 +2,15 @@ | |||||||
|  |  | ||||||
| // optimize me. Use template to avoid copy code. | // optimize me. Use template to avoid copy code. | ||||||
| using namespace AscendC; | using namespace AscendC; | ||||||
|  | #ifdef ASCEND_310P // 310P not support 4bit get row | ||||||
|  |     extern "C" __global__ __aicore__ void ascendc_get_row_q4_0( | ||||||
|  |         GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, | ||||||
|  |         GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm, | ||||||
|  |         GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { | ||||||
|  |         // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. | ||||||
|  |         printf("Ascend310P not support 4bit get row.\n"); | ||||||
|  |     } | ||||||
|  | #else | ||||||
|  |  | ||||||
| #define BUFFER_NUM 2 | #define BUFFER_NUM 2 | ||||||
|  |  | ||||||
| @@ -110,12 +119,9 @@ class GET_ROW_Q4_0 { | |||||||
|         LocalTensor<float> output_local = output_queue.AllocTensor<float>(); |         LocalTensor<float> output_local = output_queue.AllocTensor<float>(); | ||||||
|  |  | ||||||
|         // TODO: cast more data to speed up. |         // TODO: cast more data to speed up. | ||||||
| #ifdef ASCEND_310P |  | ||||||
|         // TODO: 310P support quantification |  | ||||||
| #else |  | ||||||
|         Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0); |         Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0); | ||||||
|         Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0); |         Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0); | ||||||
| #endif |  | ||||||
|         // Only mul need compile by group. |         // Only mul need compile by group. | ||||||
|         half scale = scale_gm.GetValue(scale_offset); |         half scale = scale_gm.GetValue(scale_offset); | ||||||
|  |  | ||||||
| @@ -194,3 +200,5 @@ extern "C" __global__ __aicore__ void ascendc_get_row_q4_0( | |||||||
|             indices_nb_ub, output_ne_ub, output_nb_ub); |             indices_nb_ub, output_ne_ub, output_nb_ub); | ||||||
|     op.calculate(); |     op.calculate(); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | #endif // #ifdef ASCEND_310P | ||||||
|   | |||||||
| @@ -1,6 +1,14 @@ | |||||||
| #include "kernel_operator.h" | #include "kernel_operator.h" | ||||||
|  |  | ||||||
| using namespace AscendC; | using namespace AscendC; | ||||||
|  | #ifdef ASCEND_310P | ||||||
|  |     extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0( | ||||||
|  |         GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, | ||||||
|  |         GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { | ||||||
|  |         // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. | ||||||
|  |         printf("Ascend310P not support f16->8bit quantization.\n"); | ||||||
|  |     } | ||||||
|  | #else | ||||||
|  |  | ||||||
| #define BUFFER_NUM 2 | #define BUFFER_NUM 2 | ||||||
| #define QK8_0 32 | #define QK8_0 32 | ||||||
| @@ -206,3 +214,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0( | |||||||
|     op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); |     op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); | ||||||
|     op.calculate(); |     op.calculate(); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | #endif // #ifdef ASCEND_310P | ||||||
|   | |||||||
| @@ -1,6 +1,14 @@ | |||||||
| #include "kernel_operator.h" | #include "kernel_operator.h" | ||||||
|  |  | ||||||
| using namespace AscendC; | using namespace AscendC; | ||||||
|  | #ifdef ASCEND_310P // 310P not support f32->8bit quantization | ||||||
|  |     extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0( | ||||||
|  |         GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, | ||||||
|  |         GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { | ||||||
|  |         // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. | ||||||
|  |         printf("Ascend310P not support f32->8bit quantization.\n"); | ||||||
|  |     } | ||||||
|  | #else | ||||||
|  |  | ||||||
| #define BUFFER_NUM 2 | #define BUFFER_NUM 2 | ||||||
| #define QK8_0 32 | #define QK8_0 32 | ||||||
| @@ -204,3 +212,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0( | |||||||
|     op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); |     op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); | ||||||
|     op.calculate(); |     op.calculate(); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | #endif // #ifdef ASCEND_310P | ||||||
|   | |||||||
| @@ -1,6 +1,21 @@ | |||||||
| #include "kernel_operator.h" | #include "kernel_operator.h" | ||||||
|  |  | ||||||
| using namespace AscendC; | using namespace AscendC; | ||||||
|  | #ifdef ASCEND_310P // 310P not support float->4bit quantization | ||||||
|  |     extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0( | ||||||
|  |         GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, | ||||||
|  |         GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { | ||||||
|  |         // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. | ||||||
|  |         printf("Ascend310P not support f32->4bit quantization.\n"); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0( | ||||||
|  |         GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, | ||||||
|  |         GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { | ||||||
|  |         // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. | ||||||
|  |         printf("Ascend310P not support f16->4bit quantization.\n"); | ||||||
|  |     } | ||||||
|  | #else | ||||||
|  |  | ||||||
| #define BUFFER_NUM 2 | #define BUFFER_NUM 2 | ||||||
| #define Group_Size 32 | #define Group_Size 32 | ||||||
| @@ -276,3 +291,5 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0( | |||||||
|     op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); |     op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); | ||||||
|     op.calculate(); |     op.calculate(); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | #endif // #ifdef ASCEND_310P | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 leo-pony
					leo-pony