mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-04 09:32:00 +00:00 
			
		
		
		
	ggml : workaround for missing _mm256_setr_m128i in GCC < 8 (#1638)
This commit is contained in:
		
				
					committed by
					
						
						GitHub
					
				
			
			
				
	
			
			
			
						parent
						
							555275a693
						
					
				
				
					commit
					ef3171d162
				
			
							
								
								
									
										16
									
								
								ggml.c
									
									
									
									
									
								
							
							
						
						
									
										16
									
								
								ggml.c
									
									
									
									
									
								
							@@ -492,6 +492,8 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 | 
				
			|||||||
// quantization
 | 
					// quantization
 | 
				
			||||||
//
 | 
					//
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
 | 
					#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
 | 
				
			||||||
// multiply int8_t, add results pairwise twice
 | 
					// multiply int8_t, add results pairwise twice
 | 
				
			||||||
static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
 | 
					static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
 | 
				
			||||||
@@ -551,7 +553,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
 | 
				
			|||||||
static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
 | 
					static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
 | 
					    const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
 | 
				
			||||||
    const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp);
 | 
					    const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
 | 
				
			||||||
    const __m256i lowMask = _mm256_set1_epi8( 0xF );
 | 
					    const __m256i lowMask = _mm256_set1_epi8( 0xF );
 | 
				
			||||||
    return _mm256_and_si256(lowMask, bytes);
 | 
					    return _mm256_and_si256(lowMask, bytes);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
@@ -624,7 +626,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
 | 
				
			|||||||
    bytesh = _mm_or_si128(bytesh, bit_mask);
 | 
					    bytesh = _mm_or_si128(bytesh, bit_mask);
 | 
				
			||||||
    bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
 | 
					    bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
 | 
				
			||||||
    bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
 | 
					    bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
 | 
				
			||||||
    return _mm256_set_m128i(bytesh, bytesl);
 | 
					    return MM256_SET_M128I(bytesh, bytesl);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// Unpack 32 4-bit fields into 32 bytes
 | 
					// Unpack 32 4-bit fields into 32 bytes
 | 
				
			||||||
@@ -637,7 +639,7 @@ static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
 | 
				
			|||||||
    const __m128i lowMask = _mm_set1_epi8(0xF);
 | 
					    const __m128i lowMask = _mm_set1_epi8(0xF);
 | 
				
			||||||
    tmpl = _mm_and_si128(lowMask, tmpl);
 | 
					    tmpl = _mm_and_si128(lowMask, tmpl);
 | 
				
			||||||
    tmph = _mm_and_si128(lowMask, tmph);
 | 
					    tmph = _mm_and_si128(lowMask, tmph);
 | 
				
			||||||
    return _mm256_set_m128i(tmph, tmpl);
 | 
					    return MM256_SET_M128I(tmph, tmpl);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// add int16_t pairwise and return as float vector
 | 
					// add int16_t pairwise and return as float vector
 | 
				
			||||||
@@ -645,7 +647,7 @@ static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
 | 
				
			|||||||
    const __m128i ones = _mm_set1_epi16(1);
 | 
					    const __m128i ones = _mm_set1_epi16(1);
 | 
				
			||||||
    const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
 | 
					    const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
 | 
				
			||||||
    const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
 | 
					    const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
 | 
				
			||||||
    const __m256i summed_pairs = _mm256_set_m128i(summed_pairsh, summed_pairsl);
 | 
					    const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl);
 | 
				
			||||||
    return _mm256_cvtepi32_ps(summed_pairs);
 | 
					    return _mm256_cvtepi32_ps(summed_pairs);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -2350,7 +2352,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
 | 
				
			|||||||
        const __m128i i32_1 = mul_sum_i8_pairs(bx, by);
 | 
					        const __m128i i32_1 = mul_sum_i8_pairs(bx, by);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        // Convert int32_t to float
 | 
					        // Convert int32_t to float
 | 
				
			||||||
        __m256 p = _mm256_cvtepi32_ps(_mm256_set_m128i(i32_0, i32_1));
 | 
					        __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        // Apply the scale, and accumulate
 | 
					        // Apply the scale, and accumulate
 | 
				
			||||||
        acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
 | 
					        acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
 | 
				
			||||||
@@ -2826,7 +2828,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
 | 
				
			|||||||
        __m128i bxh = _mm256_extractf128_si256(bx, 1);
 | 
					        __m128i bxh = _mm256_extractf128_si256(bx, 1);
 | 
				
			||||||
        bxl = _mm_or_si128(bxl, bxhil);
 | 
					        bxl = _mm_or_si128(bxl, bxhil);
 | 
				
			||||||
        bxh = _mm_or_si128(bxh, bxhih);
 | 
					        bxh = _mm_or_si128(bxh, bxhih);
 | 
				
			||||||
        bx = _mm256_set_m128i(bxh, bxl);
 | 
					        bx = MM256_SET_M128I(bxh, bxl);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
 | 
					        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -3082,7 +3084,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
 | 
				
			|||||||
        __m128i bxh = _mm256_extractf128_si256(bx, 1);
 | 
					        __m128i bxh = _mm256_extractf128_si256(bx, 1);
 | 
				
			||||||
        bxl = _mm_or_si128(bxl, bxhil);
 | 
					        bxl = _mm_or_si128(bxl, bxhil);
 | 
				
			||||||
        bxh = _mm_or_si128(bxh, bxhih);
 | 
					        bxh = _mm_or_si128(bxh, bxhih);
 | 
				
			||||||
        bx = _mm256_set_m128i(bxh, bxl);
 | 
					        bx = MM256_SET_M128I(bxh, bxl);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        const __m256 dy = _mm256_set1_ps(y[i].d);
 | 
					        const __m256 dy = _mm256_set1_ps(y[i].d);
 | 
				
			||||||
        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
 | 
					        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user