mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	ggml : same IQ4_NL quantization for CPU/CUDA/Metal (#6196)
* Make quantize_row_iq4_nl do the same thing as quantization on CUDA * Make quantize_row_iq4_nl do the same thing as quantization on CUDA This time for real. backend-ops tests pass. * Now fix test-quantize-fns --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
		| @@ -11705,9 +11705,8 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block | |||||||
|         ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l, |         ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l, | ||||||
|         float * scales, float * weight, uint8_t * L, |         float * scales, float * weight, uint8_t * L, | ||||||
|         const int8_t * values, |         const int8_t * values, | ||||||
|         const float * quant_weights) { |         const float * quant_weights, | ||||||
|  |         const int ntry) { | ||||||
|     const int ntry = 7; |  | ||||||
|  |  | ||||||
|     float sigma2 = 0; |     float sigma2 = 0; | ||||||
|     for (int j = 0; j < super_block_size; ++j) sigma2 += x[j]*x[j]; |     for (int j = 0; j < super_block_size; ++j) sigma2 += x[j]*x[j]; | ||||||
| @@ -11719,6 +11718,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block | |||||||
|     float max_scale = 0, amax_scale = 0; |     float max_scale = 0, amax_scale = 0; | ||||||
|     for (int ib = 0; ib < super_block_size/block_size; ++ib) { |     for (int ib = 0; ib < super_block_size/block_size; ++ib) { | ||||||
|         const float * xb = x + ib*block_size; |         const float * xb = x + ib*block_size; | ||||||
|  |         uint8_t * Lb = L + ib*block_size; | ||||||
|         if (quant_weights) { |         if (quant_weights) { | ||||||
|             const float * qw = quant_weights + ib*block_size; |             const float * qw = quant_weights + ib*block_size; | ||||||
|             for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); |             for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); | ||||||
| @@ -11736,12 +11736,13 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block | |||||||
|             scales[ib] = 0; |             scales[ib] = 0; | ||||||
|             continue; |             continue; | ||||||
|         } |         } | ||||||
|         float d = -max/values[0]; |         float d = ntry > 0 ? -max/values[0] : max/values[0]; | ||||||
|         float id = 1/d; |         float id = 1/d; | ||||||
|         float sumqx = 0, sumq2 = 0; |         float sumqx = 0, sumq2 = 0; | ||||||
|         for (int j = 0; j < block_size; ++j) { |         for (int j = 0; j < block_size; ++j) { | ||||||
|             float al = id*xb[j]; |             float al = id*xb[j]; | ||||||
|             int l = best_index_int8(16, values, al); |             int l = best_index_int8(16, values, al); | ||||||
|  |             Lb[j] = l; | ||||||
|             float q = values[l]; |             float q = values[l]; | ||||||
|             float w = weight[j]; |             float w = weight[j]; | ||||||
|             sumqx += w*q*xb[j]; |             sumqx += w*q*xb[j]; | ||||||
| @@ -11796,9 +11797,11 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block | |||||||
|         } |         } | ||||||
|     } else { |     } else { | ||||||
|         dh[0] = GGML_FP32_TO_FP16(scales[0]); |         dh[0] = GGML_FP32_TO_FP16(scales[0]); | ||||||
|         float id = scales[0] ? 1/scales[0] : 0; |         if (ntry > 0) { | ||||||
|         for (int j = 0; j < super_block_size; ++j) { |             float id = scales[0] ? 1/scales[0] : 0; | ||||||
|             L[j] = best_index_int8(16, values, id*x[j]); |             for (int j = 0; j < super_block_size; ++j) { | ||||||
|  |                 L[j] = best_index_int8(16, values, id*x[j]); | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -11823,7 +11826,7 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow | |||||||
|         for (int ibl = 0; ibl < nblock; ++ibl) { |         for (int ibl = 0; ibl < nblock; ++ibl) { | ||||||
|             const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL; |             const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL; | ||||||
|             quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l, |             quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l, | ||||||
|                     &scale, weight, L, kvalues_iq4nl, qw); |                     &scale, weight, L, kvalues_iq4nl, qw, 7); | ||||||
|         } |         } | ||||||
|         src += n_per_row; |         src += n_per_row; | ||||||
|         qrow += nblock*sizeof(block_iq4_nl); |         qrow += nblock*sizeof(block_iq4_nl); | ||||||
| @@ -11832,14 +11835,23 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow | |||||||
| } | } | ||||||
|  |  | ||||||
| void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) { | void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) { | ||||||
|     assert(k % QK4_NL == 0); |     GGML_ASSERT(k%QK4_NL == 0); | ||||||
|     block_iq4_nl * restrict y = vy; |     int nblock = k/QK4_NL; | ||||||
|     quantize_row_iq4_nl_reference(x, y, k); |     uint8_t L[QK4_NL]; | ||||||
|  |     float weight[QK4_NL]; | ||||||
|  |     uint16_t unused_h; | ||||||
|  |     uint8_t * unused_l = NULL; | ||||||
|  |     float scale; | ||||||
|  |     block_iq4_nl * iq4 = (block_iq4_nl *)vy; | ||||||
|  |     for (int ibl = 0; ibl < nblock; ++ibl) { | ||||||
|  |         quantize_row_iq4_nl_impl(QK4_NL, 32, x + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l, | ||||||
|  |                 &scale, weight, L, kvalues_iq4nl, NULL, -1); | ||||||
|  |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) { | void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) { | ||||||
|     assert(k % QK4_NL == 0); |     assert(k % QK4_NL == 0); | ||||||
|     quantize_iq4_nl(x, y, 1, k, NULL); |     quantize_row_iq4_nl(x, y, k); | ||||||
| } | } | ||||||
|  |  | ||||||
| size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { | size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { | ||||||
| @@ -11857,7 +11869,7 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow | |||||||
|         for (int ibl = 0; ibl < nblock; ++ibl) { |         for (int ibl = 0; ibl < nblock; ++ibl) { | ||||||
|             const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL; |             const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL; | ||||||
|             quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l, |             quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l, | ||||||
|                     scales, weight, L, kvalues_iq4nl, qw); |                     scales, weight, L, kvalues_iq4nl, qw, 7); | ||||||
|         } |         } | ||||||
|         src += n_per_row; |         src += n_per_row; | ||||||
|         qrow += nblock*sizeof(block_iq4_xs); |         qrow += nblock*sizeof(block_iq4_xs); | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Kawrakow
					Kawrakow