Mirror of https://github.com/ggml-org/llama.cpp.git
	ggml-cpu : update KleidiAI to v1.5.0 (#12568)
ggml-cpu : bug fix related to KleidiAI LHS packing

Signed-off-by: Dan Johansson <dan.johansson@arm.com>
@@ -359,9 +359,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
         # Fetch KleidiAI sources:
         include(FetchContent)
-        set(KLEIDIAI_COMMIT_TAG "v1.3.0")
+        set(KLEIDIAI_COMMIT_TAG "v1.5.0")
         set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
-        set(KLEIDIAI_ARCHIVE_MD5  "060bd2dc64642b091f461cc8dd7426d9")
+        set(KLEIDIAI_ARCHIVE_MD5  "ea22e1aefb800e9bc8c74d91633cc58e")
 
         if (POLICY CMP0135)
             cmake_policy(SET CMP0135 NEW)

@@ -51,11 +51,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
             /* .run_kernel            = */ kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
         },
         /* .lhs_info = */ {
-            /* .get_offset            = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
-            /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
+            /* .get_offset            = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon,
+            /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon,
             /* .packed_size           = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon,
             /* .pack_func             = */ kai_run_lhs_quant_pack_qsi8d32p_f32_neon,
-            /* .require_aligned_m_idx = */ true,
         },
         /* .rhs_info = */ {
             /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
@@ -100,7 +99,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
             /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
             /* .packed_size           = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
             /* .pack_func             = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
-            /* .require_aligned_m_idx = */ false,
         },
         /* .rhs_info = */ {
             /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
@@ -144,7 +142,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
             /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
             /* .packed_size           = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
             /* .pack_func             = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
-            /* .require_aligned_m_idx = */ false,
         },
         /* .rhs_info = */ {
             /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
@@ -189,7 +186,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
             /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
             /* .packed_size           = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
             /* .pack_func             = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
-            /* .require_aligned_m_idx = */ false,
         },
         /* .rhs_info = */ {
             /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
@@ -233,7 +229,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
             /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
             /* .packed_size           = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
             /* .pack_func             = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
-            /* .require_aligned_m_idx = */ false,
         },
         /* .rhs_info = */ {
             /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
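
Throughout these tables, the `/* .field = */` comments emulate designated initializers, which C++ only gained in C++20: the entries are positional aggregate initializers, and the comments record which member each value binds to. A toy illustration of the convention (the struct is illustrative, not one of llama.cpp's types):

    #include <cstdio>

    // Toy struct standing in for entries like lhs_packing_info above.
    struct point {
        int x;
        int y;
    };

    // Positional aggregate initialization; each comment documents the member
    // the value binds to, mirroring the gemm_gemv_kernels tables.
    static const point p = {
        /* .x = */ 1,
        /* .y = */ 2,
    };

    int main() {
        std::printf("p = (%d, %d)\n", p.x, p.y);
        return 0;
    }
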
@@ -40,7 +40,6 @@ struct lhs_packing_info {
     size_t (*packed_size)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr);
     void (*pack_func)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs,
                       size_t lhs_stride, void* lhs_packed);
-    bool require_aligned_m_idx;
 };
 
 struct rhs_packing_info {
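
For orientation, here is a compilable restatement of the `pack_func` pointer type kept above, with the parameter roles spelled out as comments. The roles are inferred from the call site in the hunks below (which pass QK4_0 as `bl` and the selected kernel's mr/kr/sr), so treat them as a hedged reading rather than official KleidiAI documentation:

    #include <cstddef>

    // Function-pointer type matching lhs_packing_info::pack_func. The
    // parameter roles are inferred from the call site, not authoritative.
    using lhs_pack_fn = void (*)(
        size_t       m,            // rows to pack in this call
        size_t       k,            // elements per row
        size_t       bl,           // quantization block length (QK4_0 below)
        size_t       mr,           // row-blocking factor of the micro-kernel
        size_t       kr,           // k-direction blocking factor
        size_t       sr,           // split ratio within a k-block
        size_t       m_idx_start,  // first row to pack, relative to lhs
        const float *lhs,          // unpacked source matrix
        size_t       lhs_stride,   // byte stride between source rows
        void        *lhs_packed);  // destination for the packed rows
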
@@ -124,8 +124,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
             size_t sr = kernel->get_sr();
 
             // Calculate number of columns to be processed per thread
-            const bool use_multithread = lhs_info->require_aligned_m_idx && m <= mr ? false : true;
-            const size_t num_m_per_thread = use_multithread ? kai_roundup(m, nth) / nth : m;
+            const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
             const size_t m_start = ith * num_m_per_thread;
             size_t m_to_process = num_m_per_thread;
             if ((m_start + m_to_process) > m) {
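
The core of the fix is the partitioning change above: rounding the per-thread row count up to a multiple of mr * nth (rather than nth) guarantees that every thread's m_start is a multiple of mr. Since these packers interleave rows in groups of mr, packed offsets are only well-defined at mr-aligned row indices, so mid-block starts could corrupt the packed LHS. A standalone sketch of the arithmetic (kai_roundup is assumed to round up to the nearest multiple, matching the KleidiAI helper of the same name; the shape and thread count are hypothetical):

    #include <cstddef>
    #include <cstdio>

    // Round `a` up to the nearest multiple of `b` (assumed semantics of the
    // KleidiAI kai_roundup helper).
    static size_t kai_roundup(size_t a, size_t b) { return (a + b - 1) / b * b; }

    int main() {
        const size_t m = 17, mr = 4, nth = 4;  // hypothetical rows, row-block, threads

        const size_t old_chunk = kai_roundup(m, nth) / nth;       // 5 rows/thread
        const size_t new_chunk = kai_roundup(m, mr * nth) / nth;  // 8 rows/thread

        for (size_t ith = 0; ith < nth; ++ith) {
            // Old: m_start = 0, 5, 10, 15 -> threads 1..3 start mid row-block.
            // New: m_start = 0, 8, 16, 24 -> every start is mr-aligned
            //      (threads whose start is past m simply have no rows to do).
            std::printf("thread %zu: old m_start %zu (mr-aligned: %d), new m_start %zu (mr-aligned: %d)\n",
                        ith,
                        ith * old_chunk, (ith * old_chunk) % mr == 0,
                        ith * new_chunk, (ith * new_chunk) % mr == 0);
        }
        return 0;
    }
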
@@ -135,11 +134,11 @@ class tensor_traits : public ggml::cpu::tensor_traits {
             if(m_start < m) {
                 // Transform LHS
                 const size_t src_stride        = src1->nb[1];
-                const float * src_ptr          = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(0, dst->src[1]->nb[1]));
+                const float * src_ptr          = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, dst->src[1]->nb[1]));
                 const size_t lhs_packed_offset = lhs_info->get_packed_offset(m_start, k, QK4_0, mr, kr, sr);
                 void * lhs_packed_ptr          = static_cast<void *>(lhs_packed + lhs_packed_offset);
 
-                lhs_info->pack_func(m_to_process, k, QK4_0, mr, kr, sr, m_start, src_ptr, src_stride, lhs_packed_ptr);
+                lhs_info->pack_func(m_to_process, k, QK4_0, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
             }
 
             ggml_barrier(params->threadpool);
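
The second half of the fix changes the calling convention: each thread now offsets the source pointer to its own row range via get_offset(m_start, ...) and passes m_idx_start = 0, instead of handing the packer the buffer base plus a global row index. The two conventions address the same bytes, as the toy model below illustrates (toy_pack is a stand-in for an LHS pack routine, not a KleidiAI function; real packers also quantize and interleave, but the pointer arithmetic is what matters here):

    #include <cassert>
    #include <cstddef>
    #include <cstring>
    #include <vector>

    // Toy stand-in for an LHS pack routine: copies m rows of k floats,
    // starting m_idx_start rows into `lhs`.
    static void toy_pack(size_t m, size_t k, size_t m_idx_start,
                         const float * lhs, size_t lhs_stride, float * out) {
        for (size_t i = 0; i < m; ++i) {
            const char * row = reinterpret_cast<const char *>(lhs)
                             + (m_idx_start + i) * lhs_stride;
            std::memcpy(out + i * k, row, k * sizeof(float));
        }
    }

    int main() {
        const size_t m = 8, k = 4, m_start = 4, m_to_process = 4;
        const size_t stride = k * sizeof(float);
        std::vector<float> lhs(m * k);
        for (size_t i = 0; i < lhs.size(); ++i) lhs[i] = float(i);

        std::vector<float> a(m_to_process * k), b(m_to_process * k);

        // Old convention: base pointer plus a global starting row index.
        toy_pack(m_to_process, k, m_start, lhs.data(), stride, a.data());

        // New convention: pointer pre-offset to the thread's rows, index 0.
        const float * src = reinterpret_cast<const float *>(
            reinterpret_cast<const char *>(lhs.data()) + m_start * stride);
        toy_pack(m_to_process, k, 0, src, stride, b.data());

        assert(a == b);  // both conventions pack the same rows
        return 0;
    }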