mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	ggml : add new Q4_2 quantization (ARM only) (#1046)
* ggml : Q4_2 ARM * ggml : add ggml_is_quantized() * llama : update llama_type_name() with Q4_2 entry * ggml : speed-up q4_2 - 4 threads: ~100ms -> ~90ms - 8 threads: ~55ms -> ~50ms * ggml : optimize q4_2 using vmlaq_n_f32 + vmulq_n_f32
This commit is contained in:
		| @@ -14,6 +14,7 @@ int main(int argc, char ** argv) { | |||||||
|         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); |         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); | ||||||
|         fprintf(stderr, "  type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0); |         fprintf(stderr, "  type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0); | ||||||
|         fprintf(stderr, "  type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1); |         fprintf(stderr, "  type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1); | ||||||
|  |         fprintf(stderr, "  type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2); | ||||||
|         return 1; |         return 1; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										282
									
								
								ggml.c
									
									
									
									
									
								
							
							
						
						
									
										282
									
								
								ggml.c
									
									
									
									
									
								
							| @@ -585,6 +585,13 @@ typedef struct { | |||||||
| } block_q4_1; | } block_q4_1; | ||||||
| static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding"); | static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding"); | ||||||
|  |  | ||||||
|  | #define QK4_2 16 | ||||||
|  | typedef struct { | ||||||
|  |     ggml_fp16_t d;         // delta | ||||||
|  |     uint8_t qs[QK4_2 / 2]; // nibbles / quants | ||||||
|  | } block_q4_2; | ||||||
|  | static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding"); | ||||||
|  |  | ||||||
| #define QK8_0 32 | #define QK8_0 32 | ||||||
| typedef struct { | typedef struct { | ||||||
|     float   d;          // delta |     float   d;          // delta | ||||||
| @@ -1045,6 +1052,49 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int | |||||||
| #endif | #endif | ||||||
| } | } | ||||||
|  |  | ||||||
|  | // reference implementation for deterministic creation of model files | ||||||
|  | static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * restrict y, int k) { | ||||||
|  |     assert(k % QK4_2 == 0); | ||||||
|  |  | ||||||
|  |     const int nb = k / QK4_2; | ||||||
|  |  | ||||||
|  |     for (int i = 0; i < nb; i++) { | ||||||
|  |         float amax = 0.0f; // absolute max | ||||||
|  |  | ||||||
|  |         for (int l = 0; l < QK4_2; l++) { | ||||||
|  |             const float v = x[i*QK4_2 + l]; | ||||||
|  |             amax = MAX(amax, fabsf(v)); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         const float d = amax / ((1 << 3) - 1); | ||||||
|  |  | ||||||
|  |         const float id = d ? 1.0f/d : 0.0f; | ||||||
|  |  | ||||||
|  |         y[i].d = GGML_FP32_TO_FP16(d); | ||||||
|  |  | ||||||
|  |         for (int l = 0; l < QK4_2; l += 2) { | ||||||
|  |             const float v0 = x[i*QK4_2 + l + 0]*id; | ||||||
|  |             const float v1 = x[i*QK4_2 + l + 1]*id; | ||||||
|  |  | ||||||
|  |             const uint8_t vi0 = (uint8_t)(v0 + 8.5f); | ||||||
|  |             const uint8_t vi1 = (uint8_t)(v1 + 8.5f); | ||||||
|  |  | ||||||
|  |             assert(vi0 < 16); | ||||||
|  |             assert(vi1 < 16); | ||||||
|  |  | ||||||
|  |             y[i].qs[l/2] = vi0 | (vi1 << 4); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int k) { | ||||||
|  |     assert(k % QK4_2 == 0); | ||||||
|  |  | ||||||
|  |     block_q4_2 * restrict y = vy; | ||||||
|  |  | ||||||
|  |     quantize_row_q4_2_reference(x, y, k); | ||||||
|  | } | ||||||
|  |  | ||||||
| // reference implementation for deterministic creation of model files | // reference implementation for deterministic creation of model files | ||||||
| static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { | static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { | ||||||
|     assert(k % QK8_0 == 0); |     assert(k % QK8_0 == 0); | ||||||
| @@ -1064,7 +1114,7 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r | |||||||
|         y[i].d = d; |         y[i].d = d; | ||||||
|  |  | ||||||
|         for (int l = 0; l < QK8_0; ++l) { |         for (int l = 0; l < QK8_0; ++l) { | ||||||
|             const float   v  = x[i*QK8_0 + l]*id; |             const float v = x[i*QK8_0 + l]*id; | ||||||
|             y[i].qs[l] = roundf(v); |             y[i].qs[l] = roundf(v); | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| @@ -1420,8 +1470,39 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in | |||||||
| #endif | #endif | ||||||
| } | } | ||||||
|  |  | ||||||
|  | static void dequantize_row_q4_2(const void * restrict vx, float * restrict y, int k) { | ||||||
|  |     assert(k % QK4_2 == 0); | ||||||
|  |     const int nb = k / QK4_2; | ||||||
|  |  | ||||||
|  |     const block_q4_2 * restrict x = vx; | ||||||
|  |  | ||||||
|  |     for (int i = 0; i < nb; i++) { | ||||||
|  |         const float d = GGML_FP16_TO_FP32(x[i].d); | ||||||
|  |  | ||||||
|  |         const uint8_t * restrict pp = x[i].qs; | ||||||
|  |  | ||||||
|  |         for (int l = 0; l < QK4_2; l += 2) { | ||||||
|  |             const uint8_t vi = pp[l/2]; | ||||||
|  |  | ||||||
|  |             const int8_t vi0 = vi & 0xf; | ||||||
|  |             const int8_t vi1 = vi >> 4; | ||||||
|  |  | ||||||
|  |             const float v0 = (vi0 - 8)*d; | ||||||
|  |             const float v1 = (vi1 - 8)*d; | ||||||
|  |  | ||||||
|  |             y[i*QK4_2 + l + 0] = v0; | ||||||
|  |             y[i*QK4_2 + l + 1] = v1; | ||||||
|  |  | ||||||
|  |             assert(!isnan(y[i*QK4_2 + l + 0])); | ||||||
|  |             assert(!isnan(y[i*QK4_2 + l + 1])); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); | static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); | ||||||
| static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); | static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); | ||||||
|  | //static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); | ||||||
|  | static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); | ||||||
|  |  | ||||||
| static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { | static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { | ||||||
|     [GGML_TYPE_Q4_0] = { |     [GGML_TYPE_Q4_0] = { | ||||||
| @@ -1438,6 +1519,13 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { | |||||||
|         .quantize_row_q_dot       = quantize_row_q4_1, |         .quantize_row_q_dot       = quantize_row_q4_1, | ||||||
|         .vec_dot_q                = ggml_vec_dot_q4_1, |         .vec_dot_q                = ggml_vec_dot_q4_1, | ||||||
|     }, |     }, | ||||||
|  |     [GGML_TYPE_Q4_2] = { | ||||||
|  |         .dequantize_row_q         = dequantize_row_q4_2, | ||||||
|  |         .quantize_row_q           = quantize_row_q4_2, | ||||||
|  |         .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_reference, | ||||||
|  |         .quantize_row_q_dot       = quantize_row_q8_0, | ||||||
|  |         .vec_dot_q                = ggml_vec_dot_q4_2_q8_0, | ||||||
|  |     }, | ||||||
|     // TODO: GGML_TYPE_Q8_0 |     // TODO: GGML_TYPE_Q8_0 | ||||||
| }; | }; | ||||||
|  |  | ||||||
| @@ -2950,6 +3038,136 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * | |||||||
|     *s = sumf; |     *s = sumf; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { | ||||||
|  |     const int nb = n / QK8_0; | ||||||
|  |  | ||||||
|  |     assert(n % QK8_0 == 0); | ||||||
|  |     assert(nb % 2 == 0); | ||||||
|  |     assert(QK8_0 == 2*QK4_2); | ||||||
|  |  | ||||||
|  |     const block_q4_2 * restrict x = vx; | ||||||
|  |     const block_q8_0 * restrict y = vy; | ||||||
|  |  | ||||||
|  |     float sumf = 0.0; | ||||||
|  |  | ||||||
|  | #if defined(__ARM_NEON) | ||||||
|  |     float32x4_t sumv0 = vdupq_n_f32(0.0f); | ||||||
|  |     float32x4_t sumv1 = vdupq_n_f32(0.0f); | ||||||
|  |  | ||||||
|  |     for (int i = 0; i < nb; i += 2) { | ||||||
|  |         const block_q4_2 * restrict x0_0 = &x[2*(i + 0) + 0]; | ||||||
|  |         const block_q4_2 * restrict x0_1 = &x[2*(i + 0) + 1]; | ||||||
|  |         const block_q4_2 * restrict x1_0 = &x[2*(i + 1) + 0]; | ||||||
|  |         const block_q4_2 * restrict x1_1 = &x[2*(i + 1) + 1]; | ||||||
|  |         const block_q8_0 * restrict y0 = &y[i + 0]; | ||||||
|  |         const block_q8_0 * restrict y1 = &y[i + 1]; | ||||||
|  |  | ||||||
|  |         const uint8x16_t m4b   = vdupq_n_u8(0xf); | ||||||
|  |         const int8x16_t  s8b   = vdupq_n_s8(0x8); | ||||||
|  |  | ||||||
|  |         const uint8x16_t v0_0 = vcombine_u8(vld1_u8(x0_0->qs), vld1_u8(x0_1->qs)); | ||||||
|  |         const uint8x16_t v0_1 = vcombine_u8(vld1_u8(x1_0->qs), vld1_u8(x1_1->qs)); | ||||||
|  |  | ||||||
|  |         // 4-bit -> 8-bit | ||||||
|  |         const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b)); | ||||||
|  |         const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); | ||||||
|  |         const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b)); | ||||||
|  |         const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); | ||||||
|  |  | ||||||
|  |         // sub 8 | ||||||
|  |         const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); | ||||||
|  |         const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); | ||||||
|  |         const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); | ||||||
|  |         const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); | ||||||
|  |  | ||||||
|  |         // interleave | ||||||
|  |         const int8x16_t v0_0lz = vzip1q_s8(v0_0ls, v0_0hs); | ||||||
|  |         const int8x16_t v0_0hz = vzip2q_s8(v0_0ls, v0_0hs); | ||||||
|  |         const int8x16_t v0_1lz = vzip1q_s8(v0_1ls, v0_1hs); | ||||||
|  |         const int8x16_t v0_1hz = vzip2q_s8(v0_1ls, v0_1hs); | ||||||
|  |  | ||||||
|  |         // load y | ||||||
|  |         const int8x16_t v1_0l = vld1q_s8(y0->qs); | ||||||
|  |         const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); | ||||||
|  |         const int8x16_t v1_1l = vld1q_s8(y1->qs); | ||||||
|  |         const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); | ||||||
|  |  | ||||||
|  | #if defined(__ARM_FEATURE_DOTPROD) | ||||||
|  |         sumv0 = vmlaq_n_f32(sumv0, vaddq_f32( | ||||||
|  |                 vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l)), GGML_FP16_TO_FP32(x0_0->d)), | ||||||
|  |                 vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hz, v1_0h)), GGML_FP16_TO_FP32(x0_1->d))), y0->d); | ||||||
|  |  | ||||||
|  |         sumv1 = vmlaq_n_f32(sumv1, vaddq_f32( | ||||||
|  |                 vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l)), GGML_FP16_TO_FP32(x1_0->d)), | ||||||
|  |                 vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1hz, v1_1h)), GGML_FP16_TO_FP32(x1_1->d))), y1->d); | ||||||
|  | #else | ||||||
|  |         const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l)); | ||||||
|  |         const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l)); | ||||||
|  |         const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h)); | ||||||
|  |         const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h)); | ||||||
|  |  | ||||||
|  |         const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l)); | ||||||
|  |         const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l)); | ||||||
|  |         const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h)); | ||||||
|  |         const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h)); | ||||||
|  |  | ||||||
|  |         const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); | ||||||
|  |         const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); | ||||||
|  |         const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); | ||||||
|  |         const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); | ||||||
|  |  | ||||||
|  |         sumv0 = vmlaq_n_f32(sumv0, vaddq_f32( | ||||||
|  |                 vmulq_n_f32(vcvtq_f32_s32(pl0), GGML_FP16_TO_FP32(x0_0->d)), | ||||||
|  |                 vmulq_n_f32(vcvtq_f32_s32(ph0), GGML_FP16_TO_FP32(x0_1->d))), y0->d); | ||||||
|  |  | ||||||
|  |         sumv1 = vmlaq_n_f32(sumv1, vaddq_f32( | ||||||
|  |                 vmulq_n_f32(vcvtq_f32_s32(pl1), GGML_FP16_TO_FP32(x1_0->d)), | ||||||
|  |                 vmulq_n_f32(vcvtq_f32_s32(ph1), GGML_FP16_TO_FP32(x1_1->d))), y1->d); | ||||||
|  | #endif | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); | ||||||
|  | #else | ||||||
|  |     // scalar | ||||||
|  |     for (int i = 0; i < nb; i++) { | ||||||
|  |         const uint8_t * restrict x0 = x[2*i + 0].qs; | ||||||
|  |         const uint8_t * restrict x1 = x[2*i + 1].qs; | ||||||
|  |         const  int8_t * restrict y0 = y[i].qs; | ||||||
|  |  | ||||||
|  |         const float d0 = GGML_FP16_TO_FP32(x[2*i + 0].d); | ||||||
|  |         const float d1 = GGML_FP16_TO_FP32(x[2*i + 1].d); | ||||||
|  |  | ||||||
|  |         int sumi_0 = 0; | ||||||
|  |         int sumi_1 = 0; | ||||||
|  |  | ||||||
|  |         for (int j = 0; j < QK8_0/4; j++) { | ||||||
|  |             const uint8_t v0 = x0[j]; | ||||||
|  |             const uint8_t v1 = x1[j]; | ||||||
|  |  | ||||||
|  |             const int i0_0 = (int8_t) (v0 & 0xf) - 8; | ||||||
|  |             const int i1_0 = (int8_t) (v0 >> 4)  - 8; | ||||||
|  |  | ||||||
|  |             const int i0_1 = (int8_t) (v1 & 0xf) - 8; | ||||||
|  |             const int i1_1 = (int8_t) (v1 >> 4)  - 8; | ||||||
|  |  | ||||||
|  |             const int i2_0 = y0[2*j + 0]; | ||||||
|  |             const int i3_0 = y0[2*j + 1]; | ||||||
|  |  | ||||||
|  |             const int i2_1 = y0[2*(j + QK8_0/4) + 0]; | ||||||
|  |             const int i3_1 = y0[2*(j + QK8_0/4) + 1]; | ||||||
|  |  | ||||||
|  |             sumi_0 += i0_0*i2_0 + i1_0*i3_0; | ||||||
|  |             sumi_1 += i0_1*i2_1 + i1_1*i3_1; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         sumf += (d0 * y[i].d) * sumi_0; | ||||||
|  |         sumf += (d1 * y[i].d) * sumi_1; | ||||||
|  |     } | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |     *s = sumf; | ||||||
|  | } | ||||||
|  |  | ||||||
| // compute GGML_VEC_DOT_UNROLL dot products at once | // compute GGML_VEC_DOT_UNROLL dot products at once | ||||||
| // xs - x row stride in bytes | // xs - x row stride in bytes | ||||||
| inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) { | inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) { | ||||||
| @@ -3196,24 +3414,26 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { | |||||||
|     [GGML_TYPE_F16]  = 1, |     [GGML_TYPE_F16]  = 1, | ||||||
|     [GGML_TYPE_Q4_0] = QK4_0, |     [GGML_TYPE_Q4_0] = QK4_0, | ||||||
|     [GGML_TYPE_Q4_1] = QK4_1, |     [GGML_TYPE_Q4_1] = QK4_1, | ||||||
|  |     [GGML_TYPE_Q4_2] = QK4_2, | ||||||
|     [GGML_TYPE_Q8_0] = QK8_0, |     [GGML_TYPE_Q8_0] = QK8_0, | ||||||
|     [GGML_TYPE_I8]   = 1, |     [GGML_TYPE_I8]   = 1, | ||||||
|     [GGML_TYPE_I16]  = 1, |     [GGML_TYPE_I16]  = 1, | ||||||
|     [GGML_TYPE_I32]  = 1, |     [GGML_TYPE_I32]  = 1, | ||||||
| }; | }; | ||||||
| static_assert(GGML_TYPE_COUNT == 8, "GGML_BLCK_SIZE is outdated"); | static_assert(GGML_TYPE_COUNT == 9, "GGML_BLCK_SIZE is outdated"); | ||||||
|  |  | ||||||
| static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { | static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { | ||||||
|     [GGML_TYPE_F32]  = sizeof(float), |     [GGML_TYPE_F32]  = sizeof(float), | ||||||
|     [GGML_TYPE_F16]  = sizeof(ggml_fp16_t), |     [GGML_TYPE_F16]  = sizeof(ggml_fp16_t), | ||||||
|     [GGML_TYPE_Q4_0] = sizeof(block_q4_0), |     [GGML_TYPE_Q4_0] = sizeof(block_q4_0), | ||||||
|     [GGML_TYPE_Q4_1] = sizeof(block_q4_1), |     [GGML_TYPE_Q4_1] = sizeof(block_q4_1), | ||||||
|  |     [GGML_TYPE_Q4_2] = sizeof(block_q4_2), | ||||||
|     [GGML_TYPE_Q8_0] = sizeof(block_q8_0), |     [GGML_TYPE_Q8_0] = sizeof(block_q8_0), | ||||||
|     [GGML_TYPE_I8]   = sizeof(int8_t), |     [GGML_TYPE_I8]   = sizeof(int8_t), | ||||||
|     [GGML_TYPE_I16]  = sizeof(int16_t), |     [GGML_TYPE_I16]  = sizeof(int16_t), | ||||||
|     [GGML_TYPE_I32]  = sizeof(int32_t), |     [GGML_TYPE_I32]  = sizeof(int32_t), | ||||||
| }; | }; | ||||||
| static_assert(GGML_TYPE_COUNT == 8, "GGML_TYPE_SIZE is outdated"); | static_assert(GGML_TYPE_COUNT == 9, "GGML_TYPE_SIZE is outdated"); | ||||||
|  |  | ||||||
|  |  | ||||||
| static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { | static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { | ||||||
| @@ -3221,12 +3441,26 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { | |||||||
|     [GGML_TYPE_F16]  = "f16", |     [GGML_TYPE_F16]  = "f16", | ||||||
|     [GGML_TYPE_Q4_0] = "q4_0", |     [GGML_TYPE_Q4_0] = "q4_0", | ||||||
|     [GGML_TYPE_Q4_1] = "q4_1", |     [GGML_TYPE_Q4_1] = "q4_1", | ||||||
|  |     [GGML_TYPE_Q4_2] = "q4_2", | ||||||
|     [GGML_TYPE_Q8_0] = "q8_0", |     [GGML_TYPE_Q8_0] = "q8_0", | ||||||
|     [GGML_TYPE_I8]   = "i8", |     [GGML_TYPE_I8]   = "i8", | ||||||
|     [GGML_TYPE_I16]  = "i16", |     [GGML_TYPE_I16]  = "i16", | ||||||
|     [GGML_TYPE_I32]  = "i32", |     [GGML_TYPE_I32]  = "i32", | ||||||
| }; | }; | ||||||
| static_assert(GGML_TYPE_COUNT == 8, "GGML_TYPE_NAME is outdated"); | static_assert(GGML_TYPE_COUNT == 9, "GGML_TYPE_NAME is outdated"); | ||||||
|  |  | ||||||
|  | static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = { | ||||||
|  |     [GGML_TYPE_F32]  = false, | ||||||
|  |     [GGML_TYPE_F16]  = false, | ||||||
|  |     [GGML_TYPE_Q4_0] = true, | ||||||
|  |     [GGML_TYPE_Q4_1] = true, | ||||||
|  |     [GGML_TYPE_Q4_2] = true, | ||||||
|  |     [GGML_TYPE_Q8_0] = true, | ||||||
|  |     [GGML_TYPE_I8]   = false, | ||||||
|  |     [GGML_TYPE_I16]  = false, | ||||||
|  |     [GGML_TYPE_I32]  = false, | ||||||
|  | }; | ||||||
|  | static_assert(GGML_TYPE_COUNT == 9, "GGML_IS_QUANTIZED is outdated"); | ||||||
|  |  | ||||||
| static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { | static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { | ||||||
|     "NONE", |     "NONE", | ||||||
| @@ -3488,6 +3722,10 @@ static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct | |||||||
|         (t0->ne[3] == t1->ne[3]); |         (t0->ne[3] == t1->ne[3]); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | static inline bool ggml_is_quantized(enum ggml_type type) { | ||||||
|  |     return GGML_IS_QUANTIZED[type]; | ||||||
|  | } | ||||||
|  |  | ||||||
| static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) { | static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) { | ||||||
|     return tensor->nb[0] > tensor->nb[1]; |     return tensor->nb[0] > tensor->nb[1]; | ||||||
| } | } | ||||||
| @@ -5609,7 +5847,7 @@ static void ggml_compute_forward_dup_f16( | |||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|             } else if (dst->type == GGML_TYPE_Q4_0 || dst->type == GGML_TYPE_Q4_1) { |             } else if (ggml_is_quantized(dst->type)) { | ||||||
|                 quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q; |                 quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q; | ||||||
|                 size_t id = 0; |                 size_t id = 0; | ||||||
|                 uint8_t * dst_ptr = (uint8_t *) dst->data; |                 uint8_t * dst_ptr = (uint8_t *) dst->data; | ||||||
| @@ -5821,7 +6059,7 @@ static void ggml_compute_forward_dup_f32( | |||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|             } else if (dst->type == GGML_TYPE_Q4_0 || dst->type == GGML_TYPE_Q4_1) { |             } else if (ggml_is_quantized(dst->type)) { | ||||||
|                 quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q; |                 quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q; | ||||||
|                 size_t id = 0; |                 size_t id = 0; | ||||||
|                 uint8_t * dst_ptr = (uint8_t *) dst->data; |                 uint8_t * dst_ptr = (uint8_t *) dst->data; | ||||||
| @@ -6184,7 +6422,7 @@ static void ggml_compute_forward_add_q_f32( | |||||||
|     GGML_ASSERT(nb1 <= nb2); |     GGML_ASSERT(nb1 <= nb2); | ||||||
|     GGML_ASSERT(nb2 <= nb3); |     GGML_ASSERT(nb2 <= nb3); | ||||||
|  |  | ||||||
|     GGML_ASSERT(src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1); |     GGML_ASSERT(ggml_is_quantized(src0->type)); | ||||||
|     GGML_ASSERT(dst->type == src0->type); |     GGML_ASSERT(dst->type == src0->type); | ||||||
|     GGML_ASSERT(src1->type == GGML_TYPE_F32); |     GGML_ASSERT(src1->type == GGML_TYPE_F32); | ||||||
|  |  | ||||||
| @@ -6254,6 +6492,7 @@ static void ggml_compute_forward_add( | |||||||
|             } break; |             } break; | ||||||
|         case GGML_TYPE_Q4_0: |         case GGML_TYPE_Q4_0: | ||||||
|         case GGML_TYPE_Q4_1: |         case GGML_TYPE_Q4_1: | ||||||
|  |         case GGML_TYPE_Q4_2: | ||||||
|             { |             { | ||||||
|                 ggml_compute_forward_add_q_f32(params, src0, src1, dst); |                 ggml_compute_forward_add_q_f32(params, src0, src1, dst); | ||||||
|             } break; |             } break; | ||||||
| @@ -7732,6 +7971,7 @@ static void ggml_compute_forward_mul_mat( | |||||||
|     switch (src0->type) { |     switch (src0->type) { | ||||||
|         case GGML_TYPE_Q4_0: |         case GGML_TYPE_Q4_0: | ||||||
|         case GGML_TYPE_Q4_1: |         case GGML_TYPE_Q4_1: | ||||||
|  |         case GGML_TYPE_Q4_2: | ||||||
|         case GGML_TYPE_Q8_0: |         case GGML_TYPE_Q8_0: | ||||||
|             { |             { | ||||||
|                 ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst); |                 ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst); | ||||||
| @@ -7987,6 +8227,7 @@ static void ggml_compute_forward_get_rows( | |||||||
|     switch (src0->type) { |     switch (src0->type) { | ||||||
|         case GGML_TYPE_Q4_0: |         case GGML_TYPE_Q4_0: | ||||||
|         case GGML_TYPE_Q4_1: |         case GGML_TYPE_Q4_1: | ||||||
|  |         case GGML_TYPE_Q4_2: | ||||||
|         case GGML_TYPE_Q8_0: |         case GGML_TYPE_Q8_0: | ||||||
|             { |             { | ||||||
|                 ggml_compute_forward_get_rows_q(params, src0, src1, dst); |                 ggml_compute_forward_get_rows_q(params, src0, src1, dst); | ||||||
| @@ -10398,7 +10639,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) | |||||||
|                         node->n_tasks = 1; |                         node->n_tasks = 1; | ||||||
|  |  | ||||||
|                         size_t cur = 0; |                         size_t cur = 0; | ||||||
|                         if (node->type == GGML_TYPE_Q4_0 || node->type == GGML_TYPE_Q4_1) { |                         if (ggml_is_quantized(node->type)) { | ||||||
|                             cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0]; |                             cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0]; | ||||||
|                         } |                         } | ||||||
|  |  | ||||||
| @@ -10410,7 +10651,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) | |||||||
|  |  | ||||||
|                         size_t cur = 0; |                         size_t cur = 0; | ||||||
|  |  | ||||||
|                         if (node->src0->type == GGML_TYPE_Q4_0 || node->src0->type == GGML_TYPE_Q4_1) { |                         if (ggml_is_quantized(node->src0->type)) { | ||||||
|                             cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads; |                             cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads; | ||||||
|                         } |                         } | ||||||
|  |  | ||||||
| @@ -11702,6 +11943,29 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * | |||||||
|     return (n/QK4_1*sizeof(block_q4_1)); |     return (n/QK4_1*sizeof(block_q4_1)); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist) { | ||||||
|  |     assert(k % QK4_2 == 0); | ||||||
|  |     const int nb = k / QK4_2; | ||||||
|  |  | ||||||
|  |     for (int j = 0; j < n; j += k) { | ||||||
|  |         block_q4_2 * restrict y = (block_q4_2 *)dst + j/QK4_2; | ||||||
|  |  | ||||||
|  |         quantize_row_q4_2_reference(src + j, y, k); | ||||||
|  |  | ||||||
|  |         for (int i = 0; i < nb; i++) { | ||||||
|  |             for (int l = 0; l < QK4_2; l += 2) { | ||||||
|  |                 const uint8_t vi0 = y[i].qs[l/2] & 0xF; | ||||||
|  |                 const uint8_t vi1 = y[i].qs[l/2] >> 4; | ||||||
|  |  | ||||||
|  |                 hist[vi0]++; | ||||||
|  |                 hist[vi1]++; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     return (n/QK4_2*sizeof(block_q4_2)); | ||||||
|  | } | ||||||
|  |  | ||||||
| //////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
| int ggml_cpu_has_avx(void) { | int ggml_cpu_has_avx(void) { | ||||||
|   | |||||||
							
								
								
									
										4
									
								
								ggml.h
									
									
									
									
									
								
							
							
						
						
									
										4
									
								
								ggml.h
									
									
									
									
									
								
							| @@ -204,7 +204,8 @@ enum ggml_type { | |||||||
|     GGML_TYPE_F16  = 1, |     GGML_TYPE_F16  = 1, | ||||||
|     GGML_TYPE_Q4_0 = 2, |     GGML_TYPE_Q4_0 = 2, | ||||||
|     GGML_TYPE_Q4_1 = 3, |     GGML_TYPE_Q4_1 = 3, | ||||||
|     GGML_TYPE_Q8_0 = 4, |     GGML_TYPE_Q4_2 = 4, | ||||||
|  |     GGML_TYPE_Q8_0 = 5, | ||||||
|     GGML_TYPE_I8, |     GGML_TYPE_I8, | ||||||
|     GGML_TYPE_I16, |     GGML_TYPE_I16, | ||||||
|     GGML_TYPE_I32, |     GGML_TYPE_I32, | ||||||
| @@ -806,6 +807,7 @@ enum ggml_opt_result ggml_opt( | |||||||
|  |  | ||||||
| size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); | size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); | ||||||
| size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); | size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); | ||||||
|  | size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist); | ||||||
|  |  | ||||||
| // | // | ||||||
| // system info | // system info | ||||||
|   | |||||||
							
								
								
									
										10
									
								
								llama.cpp
									
									
									
									
									
								
							
							
						
						
									
										10
									
								
								llama.cpp
									
									
									
									
									
								
							| @@ -478,6 +478,7 @@ struct llama_file_loader { | |||||||
|                 case GGML_TYPE_F16: |                 case GGML_TYPE_F16: | ||||||
|                 case GGML_TYPE_Q4_0: |                 case GGML_TYPE_Q4_0: | ||||||
|                 case GGML_TYPE_Q4_1: |                 case GGML_TYPE_Q4_1: | ||||||
|  |                 case GGML_TYPE_Q4_2: | ||||||
|                     break; |                     break; | ||||||
|                 default: { |                 default: { | ||||||
|                     throw format("unrecognized tensor type %u\n", shard.type); |                     throw format("unrecognized tensor type %u\n", shard.type); | ||||||
| @@ -550,6 +551,7 @@ struct llama_file_saver { | |||||||
|             case GGML_TYPE_F16: |             case GGML_TYPE_F16: | ||||||
|             case GGML_TYPE_Q4_0: |             case GGML_TYPE_Q4_0: | ||||||
|             case GGML_TYPE_Q4_1: |             case GGML_TYPE_Q4_1: | ||||||
|  |             case GGML_TYPE_Q4_2: | ||||||
|                 break; |                 break; | ||||||
|             default: LLAMA_ASSERT(false); |             default: LLAMA_ASSERT(false); | ||||||
|         } |         } | ||||||
| @@ -838,6 +840,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) { | |||||||
|         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1"; |         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: |         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: | ||||||
|                                       return "mostly Q4_1, some F16"; |                                       return "mostly Q4_1, some F16"; | ||||||
|  |         case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2"; | ||||||
|         default:                      return "unknown, may not work"; |         default:                      return "unknown, may not work"; | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -1571,6 +1574,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s | |||||||
|     switch (ftype) { |     switch (ftype) { | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break; |         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break; | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break; |         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break; | ||||||
|  |         case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break; | ||||||
|         default: throw format("invalid output file type %d\n", ftype); |         default: throw format("invalid output file type %d\n", ftype); | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
| @@ -1644,6 +1648,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s | |||||||
|                     { |                     { | ||||||
|                         new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); |                         new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); | ||||||
|                     } break; |                     } break; | ||||||
|  |                 case GGML_TYPE_Q4_2: | ||||||
|  |                     { | ||||||
|  |                         new_size = ggml_quantize_q4_2(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); | ||||||
|  |                     } break; | ||||||
|                 default: |                 default: | ||||||
|                     LLAMA_ASSERT(false); |                     LLAMA_ASSERT(false); | ||||||
|             } |             } | ||||||
| @@ -1955,7 +1963,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * | |||||||
|                 base_t = dest_t; |                 base_t = dest_t; | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) { |             if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1 || base_t->type == GGML_TYPE_Q4_2) { | ||||||
|                 if (!warned) { |                 if (!warned) { | ||||||
|                     fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, " |                     fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, " | ||||||
|                                     "use a f16 or f32 base model with --lora-base\n", __func__); |                                     "use a f16 or f32 base model with --lora-base\n", __func__); | ||||||
|   | |||||||
							
								
								
									
										1
									
								
								llama.h
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								llama.h
									
									
									
									
									
								
							| @@ -72,6 +72,7 @@ extern "C" { | |||||||
|         LLAMA_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors |         LLAMA_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors | ||||||
|         LLAMA_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors |         LLAMA_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors | ||||||
|         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 |         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 | ||||||
|  |         LLAMA_FTYPE_MOSTLY_Q4_2 = 5,  // except 1d tensors | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     LLAMA_API struct llama_context_params llama_context_default_params(); |     LLAMA_API struct llama_context_params llama_context_default_params(); | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Georgi Gerganov
					Georgi Gerganov