Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)

ggml : same IQ4_NL quantization for CPU/CUDA/Metal (#6196)
* Make quantize_row_iq4_nl do the same thing as quantization on CUDA

* Make quantize_row_iq4_nl do the same thing as quantization on CUDA

  This time for real. backend-ops tests pass.

* Now fix test-quantize-fns

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
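For context: IQ4_NL stores weights in blocks of 32 with one fp16 scale per block and a 4-bit index per weight into a fixed non-linear grid. Every path in the diff below reduces to mapping a scaled value onto the nearest grid entry via best_index_int8. The following is a minimal sketch of that lookup (illustrative, not the exact ggml-quants.c code; closest_iq4nl_index is a hypothetical name, the grid is the kvalues_iq4nl table):

#include <stdint.h>

static const int8_t kvalues_iq4nl[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
};

// Binary search for the grid value nearest to x; the table is sorted ascending.
static int closest_iq4nl_index(float x) {
    if (x <= kvalues_iq4nl[0])  return 0;
    if (x >= kvalues_iq4nl[15]) return 15;
    int lo = 0, hi = 15;
    while (hi - lo > 1) {        // invariant: kvalues_iq4nl[lo] < x <= kvalues_iq4nl[hi]
        int mid = (lo + hi)/2;
        if (kvalues_iq4nl[mid] < x) lo = mid; else hi = mid;
    }
    // pick whichever of the two bracketing entries is closer to x
    return x - kvalues_iq4nl[lo] < kvalues_iq4nl[hi] - x ? lo : hi;
}

A binary search suffices because the grid is sorted; only the final comparison between the two bracketing entries decides the index.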
ggml-quants.c
@@ -11705,9 +11705,8 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
         ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
         float * scales, float * weight, uint8_t * L,
         const int8_t * values,
-        const float * quant_weights) {
-
-    const int ntry = 7;
+        const float * quant_weights,
+        const int ntry) {
 
     float sigma2 = 0;
     for (int j = 0; j < super_block_size; ++j) sigma2 += x[j]*x[j];
@@ -11719,6 +11718,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
     float max_scale = 0, amax_scale = 0;
     for (int ib = 0; ib < super_block_size/block_size; ++ib) {
         const float * xb = x + ib*block_size;
+        uint8_t * Lb = L + ib*block_size;
         if (quant_weights) {
             const float * qw = quant_weights + ib*block_size;
             for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
@@ -11736,12 +11736,13 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
             scales[ib] = 0;
             continue;
         }
-        float d = -max/values[0];
+        float d = ntry > 0 ? -max/values[0] : max/values[0];
         float id = 1/d;
         float sumqx = 0, sumq2 = 0;
         for (int j = 0; j < block_size; ++j) {
             float al = id*xb[j];
             int l = best_index_int8(16, values, al);
+            Lb[j] = l;
             float q = values[l];
             float w = weight[j];
             sumqx += w*q*xb[j];
@@ -11796,9 +11797,11 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
         }
     } else {
         dh[0] = GGML_FP32_TO_FP16(scales[0]);
-        float id = scales[0] ? 1/scales[0] : 0;
-        for (int j = 0; j < super_block_size; ++j) {
-            L[j] = best_index_int8(16, values, id*x[j]);
+        if (ntry > 0) {
+            float id = scales[0] ? 1/scales[0] : 0;
+            for (int j = 0; j < super_block_size; ++j) {
+                L[j] = best_index_int8(16, values, id*x[j]);
+            }
         }
     }
 
@@ -11823,7 +11826,7 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow
         for (int ibl = 0; ibl < nblock; ++ibl) {
             const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
             quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
-                    &scale, weight, L, kvalues_iq4nl, qw);
+                    &scale, weight, L, kvalues_iq4nl, qw, 7);
         }
         src += n_per_row;
         qrow += nblock*sizeof(block_iq4_nl);
@@ -11832,14 +11835,23 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow
 }
 
 void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK4_NL == 0);
-    block_iq4_nl * restrict y = vy;
-    quantize_row_iq4_nl_reference(x, y, k);
+    GGML_ASSERT(k%QK4_NL == 0);
+    int nblock = k/QK4_NL;
+    uint8_t L[QK4_NL];
+    float weight[QK4_NL];
+    uint16_t unused_h;
+    uint8_t * unused_l = NULL;
+    float scale;
+    block_iq4_nl * iq4 = (block_iq4_nl *)vy;
+    for (int ibl = 0; ibl < nblock; ++ibl) {
+        quantize_row_iq4_nl_impl(QK4_NL, 32, x + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
+                &scale, weight, L, kvalues_iq4nl, NULL, -1);
+    }
 }
 
 void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
     assert(k % QK4_NL == 0);
-    quantize_iq4_nl(x, y, 1, k, NULL);
+    quantize_row_iq4_nl(x, y, k);
 }
 
 size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
@@ -11857,7 +11869,7 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow
         for (int ibl = 0; ibl < nblock; ++ibl) {
             const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
             quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l,
-                    scales, weight, L, kvalues_iq4nl, qw);
+                    scales, weight, L, kvalues_iq4nl, qw, 7);
         }
         src += n_per_row;
         qrow += nblock*sizeof(block_iq4_xs);
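With this change both entry points funnel into quantize_row_iq4_nl_impl and differ only in the new ntry argument: offline quantization (quantize_iq4_nl, quantize_iq4_xs) passes ntry = 7 to keep the iterative scale search, while the runtime quantize_row_iq4_nl passes ntry = -1 to take a single-pass path intended to match what the CUDA and Metal kernels compute. Below is a hedged sketch of that single-pass path for one block, reusing kvalues_iq4nl and closest_iq4nl_index from the sketch above and assuming the unweighted importance w = x² that the impl uses when quant_weights is NULL:

#include <math.h>
#include <stdint.h>

// Single-pass quantization of one 32-element block (the ntry <= 0 path):
// take a first-guess scale from the element with the largest magnitude,
// snap every element to the nearest grid point, then refit only the scale.
static void quantize_block_iq4nl_fast(const float x[32], float *d_out, uint8_t idx[32]) {
    float max = 0.0f, amax = 0.0f;
    for (int j = 0; j < 32; ++j) {
        float ax = fabsf(x[j]);
        if (ax > amax) { amax = ax; max = x[j]; }
    }
    // first guess maps the extreme element onto values[0] = -127
    // (the ntry <= 0 variant of: d = -max/values[0])
    float d  = max/kvalues_iq4nl[0];
    float id = d ? 1.0f/d : 0.0f;
    float sumqx = 0.0f, sumq2 = 0.0f;
    for (int j = 0; j < 32; ++j) {
        int l = closest_iq4nl_index(id*x[j]);
        idx[j] = (uint8_t) l;
        float q = kvalues_iq4nl[l];
        float w = x[j]*x[j];       // importance weight ~ x^2, as in the unweighted CPU path
        sumqx += w*q*x[j];
        sumq2 += w*q*q;
    }
    // weighted least-squares refit of the scale against the chosen indices
    *d_out = sumq2 > 0 ? sumqx/sumq2 : d;   // stored as fp16 in block_iq4_nl.d
}

Because the indices are chosen with the first-guess scale and only the scale is refit afterwards, any backend that follows this same two-step recipe produces bit-identical blocks, which appears to be the point of the change: it is what lets the backend-ops tests compare CPU and GPU output exactly.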