mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	Add AVX2 support for x86 architectures thanks to @Const-me !
This commit is contained in:
		| @@ -17,6 +17,7 @@ The main goal is to run the model using 4-bit quantization on a MacBook. | |||||||
|  |  | ||||||
| - Plain C/C++ implementation without dependencies | - Plain C/C++ implementation without dependencies | ||||||
| - Apple silicon first-class citizen - optimized via Arm Neon and Accelerate framework | - Apple silicon first-class citizen - optimized via Arm Neon and Accelerate framework | ||||||
|  | - AVX2 support for x86 architectures | ||||||
| - Mixed F16 / F32 precision | - Mixed F16 / F32 precision | ||||||
| - 4-bit quantization support | - 4-bit quantization support | ||||||
| - Runs on the CPU | - Runs on the CPU | ||||||
| @@ -185,9 +186,6 @@ When running the larger models, make sure you have enough disk space to store al | |||||||
|   In general, it seems to work, but I think it fails for unicode character support. Hopefully, someone can help with that |   In general, it seems to work, but I think it fails for unicode character support. Hopefully, someone can help with that | ||||||
| - I don't know yet how much the quantization affects the quality of the generated text | - I don't know yet how much the quantization affects the quality of the generated text | ||||||
| - Probably the token sampling can be improved | - Probably the token sampling can be improved | ||||||
| - x86 quantization support [not yet ready](https://github.com/ggerganov/ggml/pull/27). Basically, you want to run this |  | ||||||
|   on Apple Silicon. For now, on Linux and Windows you can use the F16 `ggml-model-f16.bin` model, but it will be much |  | ||||||
|   slower. |  | ||||||
| - The Accelerate framework is actually currently unused since I found that for tensor shapes typical for the Decoder, | - The Accelerate framework is actually currently unused since I found that for tensor shapes typical for the Decoder, | ||||||
|   there is no benefit compared to the ARM_NEON intrinsics implementation. Of course, it's possible that I simlpy don't |   there is no benefit compared to the ARM_NEON intrinsics implementation. Of course, it's possible that I simlpy don't | ||||||
|   know how to utilize it properly. But in any case, you can even disable it with `LLAMA_NO_ACCELERATE=1 make` and the |   know how to utilize it properly. But in any case, you can even disable it with `LLAMA_NO_ACCELERATE=1 make` and the | ||||||
|   | |||||||
							
								
								
									
										165
									
								
								ggml.c
									
									
									
									
									
								
							
							
						
						
									
										165
									
								
								ggml.c
									
									
									
									
									
								
							| @@ -359,6 +359,45 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); | |||||||
|  |  | ||||||
| #define QK 32 | #define QK 32 | ||||||
|  |  | ||||||
|  | // AVX routines provided by GH user Const-me | ||||||
|  | // ref: https://github.com/ggerganov/ggml/pull/27#issuecomment-1464934600 | ||||||
|  | #if __AVX2__ | ||||||
|  | // Unpack 32 4-bit fields into 32 bytes | ||||||
|  | // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval | ||||||
|  | inline __m256i bytesFromNibbles( const uint8_t* rsi ) | ||||||
|  | { | ||||||
|  |     // Load 16 bytes from memory | ||||||
|  |     __m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi ); | ||||||
|  |  | ||||||
|  |     // Expand bytes into uint16_t values | ||||||
|  |     __m256i bytes = _mm256_cvtepu8_epi16( tmp ); | ||||||
|  |  | ||||||
|  |     // Unpack values into individual bytes | ||||||
|  |     const __m256i lowMask = _mm256_set1_epi8( 0xF ); | ||||||
|  |     __m256i high = _mm256_andnot_si256( lowMask, bytes ); | ||||||
|  |     __m256i low = _mm256_and_si256( lowMask, bytes ); | ||||||
|  |     high = _mm256_slli_epi16( high, 4 ); | ||||||
|  |     bytes = _mm256_or_si256( low, high ); | ||||||
|  |     return bytes; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | inline __m128i packNibbles( __m256i bytes ) | ||||||
|  | { | ||||||
|  |     // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh | ||||||
|  |     const __m256i lowByte = _mm256_set1_epi16( 0xFF ); | ||||||
|  |     __m256i high = _mm256_andnot_si256( lowByte, bytes ); | ||||||
|  |     __m256i low = _mm256_and_si256( lowByte, bytes ); | ||||||
|  |     high = _mm256_srli_epi16( high, 4 ); | ||||||
|  |     bytes = _mm256_or_si256( low, high ); | ||||||
|  |  | ||||||
|  |     // Compress uint16_t lanes into bytes | ||||||
|  |     __m128i r0 = _mm256_castsi256_si128( bytes ); | ||||||
|  |     __m128i r1 = _mm256_extracti128_si256( bytes, 1 ); | ||||||
|  |     return _mm_packus_epi16( r0, r1 ); | ||||||
|  | } | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |  | ||||||
| // method 5 | // method 5 | ||||||
| // blocks of QK elements | // blocks of QK elements | ||||||
| // represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors) | // represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors) | ||||||
| @@ -414,6 +453,77 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { | |||||||
| #else | #else | ||||||
| #error "not implemented for QK" | #error "not implemented for QK" | ||||||
| #endif | #endif | ||||||
|  | #elif defined(__AVX2__) | ||||||
|  | #if QK == 32 | ||||||
|  |     for (int i = 0; i < nb; i++) { | ||||||
|  |         // Load elements into 4 AVX vectors | ||||||
|  |         __m256 v0 = _mm256_loadu_ps( x ); | ||||||
|  |         __m256 v1 = _mm256_loadu_ps( x + 8 ); | ||||||
|  |         __m256 v2 = _mm256_loadu_ps( x + 16 ); | ||||||
|  |         __m256 v3 = _mm256_loadu_ps( x + 24 ); | ||||||
|  |         x += 32; | ||||||
|  |  | ||||||
|  |         // Compute max(abs(e)) for the block | ||||||
|  |         const __m256 signBit = _mm256_set1_ps( -0.0f ); | ||||||
|  |         __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); | ||||||
|  |         maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); | ||||||
|  |         maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); | ||||||
|  |         maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); | ||||||
|  |  | ||||||
|  |         __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); | ||||||
|  |         max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); | ||||||
|  |         max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); | ||||||
|  |         const float maxScalar = _mm_cvtss_f32( max4 ); | ||||||
|  |  | ||||||
|  |         // Quantize these floats | ||||||
|  |         const float d = maxScalar / 7.0f; | ||||||
|  |         *(float *)pd = d; | ||||||
|  |         pd += bs; | ||||||
|  |         const float id = ( maxScalar != 0.0f ) ? 7.0f / maxScalar : 0.0f; | ||||||
|  |         const __m256 mul = _mm256_set1_ps( id ); | ||||||
|  |  | ||||||
|  |         // Apply the multiplier | ||||||
|  |         v0 = _mm256_mul_ps( v0, mul ); | ||||||
|  |         v1 = _mm256_mul_ps( v1, mul ); | ||||||
|  |         v2 = _mm256_mul_ps( v2, mul ); | ||||||
|  |         v3 = _mm256_mul_ps( v3, mul ); | ||||||
|  |  | ||||||
|  |         // Round to nearest integer | ||||||
|  |         v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); | ||||||
|  |         v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); | ||||||
|  |         v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); | ||||||
|  |         v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); | ||||||
|  |  | ||||||
|  |         // Convert floats to integers | ||||||
|  |         __m256i i0 = _mm256_cvtps_epi32( v0 ); | ||||||
|  |         __m256i i1 = _mm256_cvtps_epi32( v1 ); | ||||||
|  |         __m256i i2 = _mm256_cvtps_epi32( v2 ); | ||||||
|  |         __m256i i3 = _mm256_cvtps_epi32( v3 ); | ||||||
|  |  | ||||||
|  |         // Convert int32 to int16 | ||||||
|  |         i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15 | ||||||
|  |         i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31 | ||||||
|  |                                             // Convert int16 to int8 | ||||||
|  |         i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 | ||||||
|  |  | ||||||
|  |         // We got our precious signed bytes, but the order is now wrong | ||||||
|  |         // These AVX2 pack instructions process 16-byte pieces independently | ||||||
|  |         // The following instruction is fixing the order | ||||||
|  |         const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); | ||||||
|  |         i0 = _mm256_permutevar8x32_epi32( i0, perm ); | ||||||
|  |  | ||||||
|  |         // Apply offset to translate the range from [ -7 .. +7 ] into [ +1 .. +15 ] | ||||||
|  |         const __m256i off = _mm256_set1_epi8( 8 ); | ||||||
|  |         i0 = _mm256_add_epi8( i0, off ); | ||||||
|  |  | ||||||
|  |         // Compress the vector into 4 bit/value, and store | ||||||
|  |         __m128i res = packNibbles( i0 ); | ||||||
|  |         _mm_storeu_si128( ( __m128i* )pb, res ); | ||||||
|  |         pb += bs; | ||||||
|  |     } | ||||||
|  | #else | ||||||
|  | #error "not implemented for QK" | ||||||
|  | #endif | ||||||
| #elif defined(__wasm_simd128__) | #elif defined(__wasm_simd128__) | ||||||
| #if QK == 32 | #if QK == 32 | ||||||
|     for (int i = 0; i < nb; i++) { |     for (int i = 0; i < nb; i++) { | ||||||
| @@ -1285,6 +1395,61 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void | |||||||
| #else | #else | ||||||
| #error "not implemented for QK" | #error "not implemented for QK" | ||||||
| #endif | #endif | ||||||
|  | #elif defined(__AVX2__) | ||||||
|  | #if QK == 32 | ||||||
|  |     const size_t countBlocks = nb; | ||||||
|  |  | ||||||
|  |     // Initialize accumulator with zeros | ||||||
|  |     __m256 acc = _mm256_setzero_ps(); | ||||||
|  |  | ||||||
|  |     // Main loop | ||||||
|  |     for (int i = 0; i < nb; ++i) { | ||||||
|  |         const float * d0_0 = (const float *) (pd0 + i*bs); | ||||||
|  |         const float * d1_0 = (const float *) (pd1 + i*bs); | ||||||
|  |  | ||||||
|  |         const uint8_t * restrict p0 = pb0 + i*bs; | ||||||
|  |         const uint8_t * restrict p1 = pb1 + i*bs; | ||||||
|  |  | ||||||
|  |         // Compute combined scale for the block | ||||||
|  |         const __m256 scale = _mm256_mul_ps( _mm256_broadcast_ss( d0_0 ), _mm256_broadcast_ss( d1_0 ) ); | ||||||
|  |  | ||||||
|  |         // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes | ||||||
|  |         __m256i bx = bytesFromNibbles( p0 ); | ||||||
|  |         __m256i by = bytesFromNibbles( p1 ); | ||||||
|  |  | ||||||
|  |         // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. | ||||||
|  |         const __m256i off = _mm256_set1_epi8( 8 ); | ||||||
|  |         bx = _mm256_sub_epi8( bx, off ); | ||||||
|  |         by = _mm256_sub_epi8( by, off ); | ||||||
|  |  | ||||||
|  |         // Sign-extend first 16 signed bytes into int16_t | ||||||
|  |         __m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) ); | ||||||
|  |         __m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) ); | ||||||
|  |         // Compute products of int16_t integers, add pairwise | ||||||
|  |         __m256i i32 = _mm256_madd_epi16( x16, y16 ); | ||||||
|  |  | ||||||
|  |         // Sign-extend last 16 signed bytes into int16_t vectors | ||||||
|  |         x16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) ); | ||||||
|  |         y16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) ); | ||||||
|  |         // Accumulate products of int16_t integers | ||||||
|  |         i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16, y16 ) ); | ||||||
|  |  | ||||||
|  |         // Convert int32_t to float | ||||||
|  |         __m256 p = _mm256_cvtepi32_ps( i32 ); | ||||||
|  |         // Apply the scale, and accumulate | ||||||
|  |         acc = _mm256_fmadd_ps( scale, p, acc ); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // Return horizontal sum of the acc vector | ||||||
|  |     __m128 res = _mm256_extractf128_ps( acc, 1 ); | ||||||
|  |     res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) ); | ||||||
|  |     res = _mm_add_ps( res, _mm_movehl_ps( res, res ) ); | ||||||
|  |     res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); | ||||||
|  |  | ||||||
|  |     sumf = _mm_cvtss_f32( res ); | ||||||
|  | #else | ||||||
|  | #error "not implemented for QK" | ||||||
|  | #endif | ||||||
| #elif defined(__wasm_simd128__) | #elif defined(__wasm_simd128__) | ||||||
| #if QK == 32 | #if QK == 32 | ||||||
|     // wasm simd |     // wasm simd | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Georgi Gerganov
					Georgi Gerganov