mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	Use full range for q4_0 quantization
By keeping the sign of the highest-magnitude value, we can make sure that value maps to -8, a quantization level that is currently unused. This is a bit of a freebie, since it is fully backwards compatible with the current format.

quantize-stats output:
before (7B): q4_0 : mse 0.00000492, maxerr 0.14257812
after (7B): q4_0 : mse 0.00000386, maxerr 0.18200684

(Most layers have reduced maxerr under this rule, but the total max error is indeed slightly higher.)
This commit is contained in:
		 Håkon H. Hitland
					Håkon H. Hitland
				
			
				
					committed by
					
						 Georgi Gerganov
						Georgi Gerganov
					
				
			
			
				
	
			
			
			 Georgi Gerganov
						Georgi Gerganov
					
				
			
						parent
						
							0e018fe008
						
					
				
				
					commit
					3698f79e6a
				
			
							
								
								
									
										12
									
								
								ggml.c
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								ggml.c
									
									
									
									
									
								
							| @@ -680,13 +680,17 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r | ||||
|  | ||||
|     for (int i = 0; i < nb; i++) { | ||||
|         float amax = 0.0f; // absolute max | ||||
|         float max = 0.0f; | ||||
|  | ||||
|         for (int l = 0; l < QK4_0; l++) { | ||||
|             const float v = x[i*QK4_0 + l]; | ||||
|             amax = MAX(amax, fabsf(v)); | ||||
|             if (amax < fabsf(v)) { | ||||
|                 amax = fabsf(v); | ||||
|                 max = v; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         const float d = amax / ((1 << 3) - 1); | ||||
|         const float d = max / -8; | ||||
|         const float id = d ? 1.0f/d : 0.0f; | ||||
|  | ||||
|         y[i].d = d; | ||||
| @@ -695,8 +699,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r | ||||
|             const float v0 = x[i*QK4_0 + l + 0]*id; | ||||
|             const float v1 = x[i*QK4_0 + l + 1]*id; | ||||
|  | ||||
|             const uint8_t vi0 = (int8_t)roundf(v0) + 8; | ||||
|             const uint8_t vi1 = (int8_t)roundf(v1) + 8; | ||||
|             const uint8_t vi0 = MIN(15, (int8_t)roundf(v0) + 8); | ||||
|             const uint8_t vi1 = MIN(15, (int8_t)roundf(v1) + 8); | ||||
|  | ||||
|             assert(vi0 < 16); | ||||
|             assert(vi1 < 16); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user