mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	ggml : support AVX512VNNI (#6280)
This change causes some quants (e.g. Q4_0, Q8_0) to go faster on some architectures (e.g. AMD Zen 4).
This commit is contained in:
@@ -132,7 +132,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
 }
 
 static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-#if __AVXVNNI__
+#if defined(__AVXVNNI__) || defined(__AVX512VNNI__)
     const __m256i zero = _mm256_setzero_si256();
     const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
     return _mm256_cvtepi32_ps(summed_pairs);
		Reference in New Issue
	
	Block a user
	 Justine Tunney
					Justine Tunney