Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
	Retire the ggml_mul_mat() branch for transposed src0 (#500)
* Retire the ggml_mul_mat() branch for transposed src0
  - It can always be made contiguous with ggml_cpy()
  - The code is now simplified
  - The results are deterministic with respect to the number of threads
    (the retired path accumulated per-thread partial sums during FINALIZE,
    so the floating-point summation order depended on the thread count)

* SIMD-ify dequantize_row_q4_0() for ARM_NEON (#502)
  - Attempt to SIMD-ify dequantize_row_q4_0() for ARM_NEON
  - Fix dequantization - forgot to interleave the quants
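For callers that previously passed a transposed src0 directly, a minimal sketch of the new pattern (assuming a valid `ggml_context * ctx`; `a` and `b` are placeholder tensors):

```c
// Sketch: ggml_mul_mat() now asserts !ggml_is_transposed(a), so a
// transposed view must be copied into a contiguous tensor first.
struct ggml_tensor * at = ggml_transpose(ctx, a);   // view: nb[0] > nb[1]
struct ggml_tensor * ac = ggml_cpy(ctx, at,
        ggml_new_tensor_2d(ctx, at->type, at->ne[0], at->ne[1]));  // contiguous copy
struct ggml_tensor * r  = ggml_mul_mat(ctx, ac, b); // row-major fast path
```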
ggml.c | 609 lines changed
@@ -496,7 +496,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, void * restric
 void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
     assert(k % QK == 0);
 
-#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__)
+#if defined(__ARM_NEON) || defined(__AVX2__) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__)
     const int nb = k / QK;
     const size_t bs = sizeof(float) + QK/2;
 
@@ -507,7 +507,6 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
 #endif
 
 #if defined(__POWER9_VECTOR__)
-#if QK == 32
     const vector float v85 = vec_splats(8.5f);
     for (int i = 0; i < nb; i++) {
         float amax = 0.0f; // absolute max
@@ -548,11 +547,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
         //memcpy(pb, pp, sizeof(pp));
         pb += bs;
     }
-#else
-#error "not implemented for QK"
-#endif
 #elif __ARM_NEON
-#if QK == 32
     for (int i = 0; i < nb; i++) {
         float amax = 0.0f; // absolute max
 
@@ -589,11 +584,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
         memcpy(pb, pp, sizeof(pp));
         pb += bs;
     }
-#else
-#error "not implemented for QK"
-#endif
 #elif defined(__AVX2__)
-#if QK == 32
     for (int i = 0; i < nb; i++) {
         // Load elements into 4 AVX vectors
         __m256 v0 = _mm256_loadu_ps( x );
@@ -660,11 +651,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
         _mm_storeu_si128( ( __m128i* )pb, res );
         pb += bs;
     }
-#else
-#error "not implemented for QK"
-#endif
 #elif defined(__wasm_simd128__)
-#if QK == 32
     for (int i = 0; i < nb; i++) {
         float amax = 0.0f; // absolute max
 
@@ -701,9 +688,6 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
         memcpy(pb, pp, sizeof(pp));
         pb += bs;
     }
-#else
-#error "not implemented for QK"
-#endif
 #else
     // scalar
     quantize_row_q4_0_reference(x, y, k);
@@ -771,7 +755,7 @@ void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) {
     const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
     const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + sizeof(float));
 
-#if defined(__AVX2__) && QK % 32 == 0
+#if defined(__AVX2__)
     for (int i = 0; i < nb; i++) {
         // scale factor
         const __m256 d_v = _mm256_broadcast_ss((const float *) (pd + i*bs));
@@ -804,6 +788,59 @@ void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) {
             }
         }
     }
+#elif defined(__ARM_NEON)
+    for (int i = 0; i < nb; i++) {
+        const float d = *(const float *) (pd + i*bs);
+
+        const uint8_t * restrict pp = pb + i*bs;
+
+        const float32x4_t vd = vdupq_n_f32(d);
+
+        for (int l = 0; l < QK; l += 16) {
+            // Load 16x4-bit integers into 8x8-bit integers
+            const uint8x8_t v8 = vld1_u8(pp + l/2);
+
+            // Expand 4-bit nibbles to 8-bit bytes
+            const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0f));
+            const uint8x8_t v1 = vshr_n_u8(v8, 4);
+
+            // Convert to signed 8-bit integers
+            const int8x8_t vs_0 = vreinterpret_s8_u8(v0);
+            const int8x8_t vs_1 = vreinterpret_s8_u8(v1);
+
+            // Subtract 8 from each byte
+            const int8x8_t vb_0 = vsub_s8(vs_0, vdup_n_s8(8));
+            const int8x8_t vb_1 = vsub_s8(vs_1, vdup_n_s8(8));
+
+            // Interleave and combine
+            const int8x8_t vx_0 = vzip1_s8(vb_0, vb_1);
+            const int8x8_t vx_1 = vzip2_s8(vb_0, vb_1);
+
+            const int8x16_t vq = vcombine_s8(vx_0, vx_1);
+
+            // convert to 2x int16x8_t
+            const int16x8_t vi_0 = vmovl_s8(vget_low_s8 (vq));
+            const int16x8_t vi_1 = vmovl_s8(vget_high_s8(vq));
+
+            // convert to 4x float32x4_t
+            const float32x4_t vf_0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vi_0)));
+            const float32x4_t vf_1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vi_0)));
+            const float32x4_t vf_2 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vi_1)));
+            const float32x4_t vf_3 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vi_1)));
+
+            // Multiply by d
+            const float32x4_t r0 = vmulq_f32(vf_0, vd);
+            const float32x4_t r1 = vmulq_f32(vf_1, vd);
+            const float32x4_t r2 = vmulq_f32(vf_2, vd);
+            const float32x4_t r3 = vmulq_f32(vf_3, vd);
+
+            // Store
+            vst1q_f32(y + i*QK + l +  0, r0);
+            vst1q_f32(y + i*QK + l +  4, r1);
+            vst1q_f32(y + i*QK + l +  8, r2);
+            vst1q_f32(y + i*QK + l + 12, r3);
+        }
+    }
 #else
     // scalar
     for (int i = 0; i < nb; i++) {
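The NEON block added above unpacks two 4-bit quants from each byte and must interleave the low and high nibbles so elements land back in their original order (the bug fixed by the "forgot to interleave the quants" follow-up). A scalar sketch of the same computation, using the names from the diff:

```c
// Scalar equivalent of the NEON dequantization above (sketch).
// Each byte of pp packs two 4-bit quants: the low nibble is element
// 2*j and the high nibble is element 2*j+1 - hence vzip1/vzip2 above.
for (int j = 0; j < QK/2; ++j) {
    const uint8_t q = pp[j];
    y[i*QK + 2*j + 0] = ((int8_t)(q & 0x0f) - 8)*d;
    y[i*QK + 2*j + 1] = ((int8_t)(q >>   4) - 8)*d;
}
```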
@@ -1500,8 +1537,7 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
 
     float sumf = 0.0;
 
-#ifdef __ARM_NEON
-#if QK == 32
+#if defined(__ARM_NEON)
     float sum0 = 0.0f;
     float sum1 = 0.0f;
 
@@ -1600,12 +1636,7 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
     }
 
     sumf = sum0 + sum1;
-#else
-#error "not implemented for QK"
-#endif
 #elif defined(__AVX512F__)
-
-#if QK == 32
     // Initialize accumulator with zeros
     __m512 acc0 = _mm512_setzero_ps();
     __m512 acc1 = _mm512_setzero_ps();
@@ -1634,11 +1665,7 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
 
     // Horizontal sum of all lanes of the accumulator
     sumf = _mm512_reduce_add_ps( acc0 ) + _mm512_reduce_add_ps( acc1 );
-#else
-#error "not implemented for QK"
-#endif
 #elif defined(__AVX2__)
-#if QK == 32
     const size_t countBlocks = nb;
 
     // Initialize accumulator with zeros
@@ -1689,11 +1716,7 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
     res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
 
     sumf = _mm_cvtss_f32( res );
-#else
-#error "not implemented for QK"
-#endif
 #elif defined(__wasm_simd128__)
-#if QK == 32
     // wasm simd
     float sum0 = 0.0f;
     float sum1 = 0.0f;
@@ -1776,9 +1799,6 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
     }
 
     sumf = sum0 + sum1;
-#else
-#error "not implemented for QK"
-#endif
 #else
     // scalar
     for (int i = 0; i < nb; i++) {
@@ -1823,7 +1843,6 @@ inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void
     float sumf = 0.0;
 
 #if defined(__AVX2__)
-#if QK == 32
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
     // Accumulator for constant offsets
@@ -1898,9 +1917,6 @@ inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void
     res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
 
     sumf = _mm_cvtss_f32( res ) + acc_offset * QK;
-#else
-#error "not implemented for QK"
-#endif
 #else
     // scalar
     for (int i = 0; i < nb; i++) {
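For reference, the two block formats these dot products operate on, as visible in the scalar paths elsewhere in this diff (q4_0 stores one fp32 scale per block; q4_1 additionally stores a per-block minimum):

```c
// Block formats assumed by the kernels above (QK == 32 throughout):
//   q4_0: x[l] ~ d*(q[l] - 8)     block = 1 float (d)    + QK/2 bytes
//   q4_1: x[l] ~ d*q[l] + m       block = 2 floats (d,m) + QK/2 bytes
// where q[l] is a 4-bit unsigned quant in [0, 15].
```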
@@ -2017,167 +2033,6 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
 #endif
 }
 
-inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_fp16_t * restrict x, const float v) {
-#if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F16_STEP - 1));
-
-    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
-
-    GGML_F16_VEC ax[GGML_F16_ARR];
-    GGML_F16_VEC ay[GGML_F16_ARR];
-
-    for (int i = 0; i < np; i += GGML_F16_STEP) {
-        for (int j = 0; j < GGML_F16_ARR; j++) {
-            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
-
-            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
-        }
-    }
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        GGML_ASSERT(false);
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
-    }
-#else
-    for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
-    }
-#endif
-}
-
-inline static void ggml_vec_mad_q4_0(const int n, float * restrict y, void * restrict x, const float v) {
-    assert(n % QK == 0);
-
-    const int nb = n / QK;
-    const size_t bs = sizeof(float) + QK/2;
-
-    const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
-    const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + sizeof(float));
-
-#if __ARM_NEON
-#if QK == 32
-    for (int i = 0; i < nb; ++i) {
-        const float d0 = v*(*(const float *) (pd + i*bs));
-
-        const uint8_t * restrict pp = pb + i*bs;
-
-        const uint8x8_t m4b = vdup_n_u8(0xf);
-        const int8x8_t  s8b = vdup_n_s8(0x8);
-
-        const float32x4_t vd = vdupq_n_f32(d0);
-
-        for (int j = 0; j < 2; j++) {
-            const uint8x8_t vx = vld1_u8(pp + j*8);
-
-            const int8x8_t vxl = vreinterpret_s8_u8(vand_u8(vx, m4b));
-            const int8x8_t vxh = vreinterpret_s8_u8(vshr_n_u8(vx, 4));
-
-            // sub 8
-            const int8x8_t vxls = vsub_s8(vxl, s8b);
-            const int8x8_t vxhs = vsub_s8(vxh, s8b);
-
-            //const int8x8_t vxlt = vzip_s8(vxls, vxhs)[0];
-            //const int8x8_t vxht = vzip_s8(vxls, vxhs)[1];
-            const int8x8_t vxlt = vzip1_s8(vxls, vxhs);
-            const int8x8_t vxht = vzip2_s8(vxls, vxhs);
-
-            const int8x16_t vxq = vcombine_s8(vxlt, vxht);
-
-            // convert to 2x int16x8_t
-            const int16x8_t vxq0 = vmovl_s8(vget_low_s8 (vxq));
-            const int16x8_t vxq1 = vmovl_s8(vget_high_s8(vxq));
-
-            // convert to 4x float32x4_t
-            const float32x4_t vx0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vxq0)));
-            const float32x4_t vx1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vxq0)));
-            const float32x4_t vx2 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vxq1)));
-            const float32x4_t vx3 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vxq1)));
-
-            const float32x4_t vy0 = vld1q_f32(y + i*32 + j*16 + 0);
-            const float32x4_t vy1 = vld1q_f32(y + i*32 + j*16 + 4);
-            const float32x4_t vy2 = vld1q_f32(y + i*32 + j*16 + 8);
-            const float32x4_t vy3 = vld1q_f32(y + i*32 + j*16 + 12);
-
-            const float32x4_t vr0 = vfmaq_f32(vy0, vx0, vd);
-            const float32x4_t vr1 = vfmaq_f32(vy1, vx1, vd);
-            const float32x4_t vr2 = vfmaq_f32(vy2, vx2, vd);
-            const float32x4_t vr3 = vfmaq_f32(vy3, vx3, vd);
-
-            vst1q_f32(y + i*32 + j*16 + 0,  vr0);
-            vst1q_f32(y + i*32 + j*16 + 4,  vr1);
-            vst1q_f32(y + i*32 + j*16 + 8,  vr2);
-            vst1q_f32(y + i*32 + j*16 + 12, vr3);
-        }
-    }
-#endif
-#else
-    // scalar
-    for (int i = 0; i < nb; i++) {
-        const float d = *(const float *) (pd + i*bs);
-
-        const uint8_t * restrict pp = pb + i*bs;
-
-        for (int l = 0; l < QK; l += 2) {
-            const uint8_t vi = pp[l/2];
-
-            const int8_t vi0 = vi & 0xf;
-            const int8_t vi1 = vi >> 4;
-
-            const float v0 = (vi0 - 8)*d;
-            const float v1 = (vi1 - 8)*d;
-
-            y[i*QK + l + 0] += v0*v;
-            y[i*QK + l + 1] += v1*v;
-
-            assert(!isnan(y[i*QK + l + 0]));
-            assert(!isnan(y[i*QK + l + 1]));
-            assert(!isinf(y[i*QK + l + 0]));
-            assert(!isinf(y[i*QK + l + 1]));
-        }
-    }
-#endif
-}
-
-inline static void ggml_vec_mad_q4_1(const int n, float * restrict y, void * restrict x, const float v) {
-    assert(n % QK == 0);
-
-    const int nb = n / QK;
-    const size_t bs = 2*sizeof(float) + QK/2;
-
-    const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
-    const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs +   sizeof(float));
-    const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float));
-
-    for (int i = 0; i < nb; i++) {
-        const float d = *(const float *) (pd + i*bs);
-        const float m = *(const float *) (pm + i*bs);
-
-        const uint8_t * restrict pp = pb + i*bs;
-
-        for (int l = 0; l < QK; l += 2) {
-            const uint8_t vi = pp[l/2];
-
-            const uint8_t vi0 = vi & 0xf;
-            const uint8_t vi1 = vi >> 4;
-
-            const float v0 = d*vi0 + m;
-            const float v1 = d*vi1 + m;
-
-            y[i*QK + l + 0] += v0*v;
-            y[i*QK + l + 1] += v1*v;
-
-            assert(!isnan(y[i*QK + l + 0]));
-            assert(!isnan(y[i*QK + l + 1]));
-            assert(!isinf(y[i*QK + l + 0]));
-            assert(!isinf(y[i*QK + l + 1]));
-            //printf("mad: v0 %f v1 %f, i = %d, l = %d, d = %f, vi = %d, vi0 = %d, vi1 = %d\n", v0, v1, i, l, d, vi, vi0, vi1);
-        }
-    }
-}
-
 //inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) { for (int i = 0; i < n; ++i) y[i] *= v;          }
 inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) {
 #if defined(GGML_SIMD)
@@ -2617,6 +2472,10 @@ static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct
         (t0->ne[3] == t1->ne[3]);
 }
 
+static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
+    return tensor->nb[0] > tensor->nb[1];
+}
+
 static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
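ggml stores per-dimension byte strides in nb[]: for a contiguous 2-D tensor, nb[0] is the element size and nb[1] the row stride, and a transpose view simply swaps them. A minimal usage sketch of the new helper, assuming a valid `ggml_context * ctx`:

```c
// Sketch: ggml_transpose() swaps ne[0]/ne[1] and nb[0]/nb[1] on a view,
// so the stride comparison above detects it without extra metadata.
struct ggml_tensor * t  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 8);
struct ggml_tensor * tt = ggml_transpose(ctx, t);
assert(!ggml_is_transposed(t));   // nb[0] == sizeof(float) < nb[1]
assert( ggml_is_transposed(tt));  // strides swapped by the view
```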
@@ -4010,6 +3869,7 @@ struct ggml_tensor * ggml_mul_mat(
         struct ggml_tensor  * a,
         struct ggml_tensor  * b) {
     GGML_ASSERT(ggml_can_mul_mat(a, b));
+    GGML_ASSERT(!ggml_is_transposed(a));
 
     bool is_node = false;
 
@@ -5949,7 +5809,7 @@ static void ggml_compute_forward_mul_mat_f32(
     assert(ne3  == ne13);
 
     // TODO: we don't support permuted src0
-    assert(nb00 == sizeof(float) || nb01 == sizeof(float));
+    assert(nb00 == sizeof(float));
 
     // dst cannot be transposed or permuted
     assert(nb0 == sizeof(float));
@@ -5964,9 +5824,6 @@ static void ggml_compute_forward_mul_mat_f32(
 
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
-    //
-    // nb00 <  nb01 - src0 is transposed
-    //   compute by src0 columns
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
@@ -6007,42 +5864,13 @@ static void ggml_compute_forward_mul_mat_f32(
 #endif
 
     if (params->type == GGML_TASK_INIT) {
-        if (nb01 >= nb00) {
-            return;
-        }
-
-        // TODO: fix this memset (wsize is overestimated)
-        memset(params->wdata, 0, params->wsize);
         return;
     }
 
     if (params->type == GGML_TASK_FINALIZE) {
-        if (nb01 >= nb00) {
         return;
     }
 
-        // TODO: fix this memset (wsize is overestimated)
-        //assert(params->wsize == (ggml_nbytes(dst) + CACHE_LINE_SIZE)*nth);
-
-        float * const wdata = params->wdata;
-
-        // cols per thread
-        const int dc = (ne + nth - 1)/nth;
-
-        // col range for this thread
-        const int ic0 = dc*ith;
-        const int ic1 = MIN(ic0 + dc, ne);
-
-        ggml_vec_cpy_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + ic0);
-
-        for (int k = 1; k < nth; k++) {
-            ggml_vec_acc_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + (ne + CACHE_LINE_SIZE_F32)*k + ic0);
-        }
-
-        return;
-    }
-
-    if (nb01 >= nb00) {
     // TODO: do not support transposed src1
     assert(nb10 == sizeof(float));
 
@@ -6082,53 +5910,6 @@ static void ggml_compute_forward_mul_mat_f32(
                     (float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)));
         }
     }
-    } else {
-        // parallelize by src1 columns using ggml_vec_mad_f32
-        // each thread has its own work data
-        // during FINALIZE we accumulate all work data into dst
-
-        // total columns in src1
-        const int nc = ne10;
-
-        // columns per thread
-        const int dc = (nc + nth - 1)/nth;
-
-        // column range for this thread
-        const int ic0 = dc*ith;
-        const int ic1 = MIN(ic0 + dc, nc);
-
-        // work data for thread
-        const int wo = (ne + CACHE_LINE_SIZE_F32)*ith;
-        float * const wdata = params->wdata;
-
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                for (int i11 = 0; i11 < ne11; ++i11) {
-                    for (int ic = ic0; ic < ic1; ++ic) {
-                        // src1 indices
-                        const int i10 = ic;
-
-                        // src0 indices
-                        const int i03 = i13;
-                        const int i02 = i12;
-                        const int i00 = ic;
-
-                        // dst indices
-                        const int i1 = i11;
-                        const int i2 = i12;
-                        const int i3 = i13;
-
-                        assert(sizeof(float)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize);
-
-                        ggml_vec_mad_f32(ne01,
-                                (float *) (wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0),
-                                (float *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03)),
-                               *(float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13)));
-                    }
-                }
-            }
-        }
-    }
 
     //int64_t t1 = ggml_perf_time_us();
     //static int64_t acc = 0;
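With the column path gone, every mul_mat variant parallelizes only by src0 rows, so each dst element is written by exactly one thread; the removed FINALIZE step summed per-thread buffers in an order that depended on nth, which is what made results thread-count-dependent. A sketch of the partitioning pattern used by the surviving row paths:

```c
// Row partitioning used by the remaining (non-transposed) paths - sketch.
const int nr  = ne01*ne02*ne03;      // total rows in src0
const int dr  = (nr + nth - 1)/nth;  // rows per thread (ceiling division)
const int ir0 = dr*ith;              // first row owned by this thread
const int ir1 = MIN(ir0 + dr, nr);   // one past the last row
for (int ir = ir0; ir < ir1; ++ir) {
    // each thread writes whole dst rows: no cross-thread accumulation,
    // hence no FINALIZE reduction and no nth-dependent rounding
}
```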
@@ -6192,7 +5973,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     GGML_ASSERT(ne3  == ne13);
 
     // TODO: we don't support permuted src0
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t) || nb01 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
 
     // dst cannot be transposed or permuted
     GGML_ASSERT(nb0 == sizeof(float));
@@ -6207,9 +5988,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
 
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
-    //
-    // nb00 <  nb01 - src0 is transposed
-    //   compute by src0 columns
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
@@ -6261,7 +6039,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
 #endif
 
     if (params->type == GGML_TASK_INIT) {
-        if (nb01 >= nb00) {
         ggml_fp16_t * const wdata = params->wdata;
 
         size_t id = 0;
@@ -6280,42 +6057,10 @@ static void ggml_compute_forward_mul_mat_f16_f32(
         return;
     }
 
-        // TODO: fix this memset (wsize is overestimated)
-        memset(params->wdata, 0, params->wsize);
-        return;
-    }
-
     if (params->type == GGML_TASK_FINALIZE) {
-        if (nb01 >= nb00) {
         return;
     }
 
-        // TODO: fix this memset (wsize is overestimated)
-        //assert(params->wsize == (ggml_nbytes(dst) + CACHE_LINE_SIZE)*nth);
-
-        ggml_fp16_t * const wdata = params->wdata;
-
-        // cols per thread
-        const int dc = (ne + nth - 1)/nth;
-
-        // col range for this thread
-        const int ic0 = dc*ith;
-        const int ic1 = MIN(ic0 + dc, ne);
-
-        for (int i = ic0; i < ic1; ++i) {
-            ((float *) dst->data)[i] = GGML_FP16_TO_FP32(wdata[i]);
-        }
-
-        for (int k = 1; k < nth; k++) {
-            for (int i = ic0; i < ic1; ++i) {
-                ((float *) dst->data)[i] += GGML_FP16_TO_FP32(wdata[(ne + CACHE_LINE_SIZE_F32)*k + i]);
-            }
-        }
-
-        return;
-    }
-
-    if (nb01 >= nb00) {
     // fp16 -> half the size, so divide by 2
     // TODO: do not support transposed src1
     assert(nb10/2 == sizeof(ggml_fp16_t));
@@ -6356,55 +6101,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
             ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00);
         }
     }
-    } else {
-        // parallelize by src1 columns using ggml_vec_mad_f16
-        // each thread has its own work data
-        // during FINALIZE we accumulate all work data into dst
-
-        // total columns in src1
-        const int nc = ne10;
-
-        // columns per thread
-        const int dc = (nc + nth - 1)/nth;
-
-        // column range for this thread
-        const int ic0 = dc*ith;
-        const int ic1 = MIN(ic0 + dc, nc);
-
-        // work data for thread
-        const int wo = (ne + CACHE_LINE_SIZE_F32)*ith;
-        ggml_fp16_t * const wdata = params->wdata;
-
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                for (int i11 = 0; i11 < ne11; ++i11) {
-                    // dst indices
-                    const int i1 = i11;
-                    const int i2 = i12;
-                    const int i3 = i13;
-
-                    ggml_fp16_t * dst_row = wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0;
-
-                    for (int ic = ic0; ic < ic1; ++ic) {
-                        // src1 indices
-                        const int i10 = ic;
-
-                        // src0 indices
-                        const int i03 = i13;
-                        const int i02 = i12;
-                        const int i00 = ic;
-
-                        assert(sizeof(ggml_fp16_t)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize);
-
-                        ggml_fp16_t * src0_col =  (ggml_fp16_t *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03));
-                        float         src1_val = *      (float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-
-                        ggml_vec_mad_f16(ne01, dst_row, src0_col, src1_val);
-                    }
-                }
-            }
-        }
-    }
 
     //int64_t t1 = ggml_time_us();
     //static int64_t acc = 0;
@@ -6467,7 +6163,7 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
     GGML_ASSERT(ne3  == ne13);
 
     // TODO: we don't support permuted src0
-    GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_0] || nb01 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_0]);
+    GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_0]);
 
     // dst cannot be transposed or permuted
     GGML_ASSERT(nb0 == sizeof(float));
@@ -6482,9 +6178,6 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
 
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
-    //
-    // nb00 <  nb01 - src0 is transposed
-    //   compute by src0 columns
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
@@ -6509,9 +6202,6 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
                 {
                     size_t id = 0;
                     for (int i01 = 0; i01 < ne01; ++i01) {
-                        //for (int i00 = 0; i00 < ne00; ++i00) {
-                        //    wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
-                        //}
                         dequantize_row_q4_0((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
                         id += ne00;
                     }
@@ -6538,16 +6228,11 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
 #endif
 
     if (params->type == GGML_TASK_INIT) {
-        //printf("HHHHHHHHH ith = %d, nth = %d\n", ith, nth);
-        if (nb01 >= nb00) {
         char * wdata = params->wdata;
 
         for (int i13 = 0; i13 < ne13; ++i13) {
             for (int i12 = 0; i12 < ne12; ++i12) {
                 for (int i11 = 0; i11 < ne11; ++i11) {
-                        //for (int i10 = 0; i10 < ne10; ++i10) {
-                        //    wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
-                        //}
                     quantize_row_q4_0((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
                     wdata += (ne10*GGML_TYPE_SIZE[GGML_TYPE_Q4_0])/GGML_BLCK_SIZE[GGML_TYPE_Q4_0];
                 }
@@ -6557,35 +6242,10 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
         return;
     }
 
-        // TODO: fix this memset (wsize is overestimated)
-        memset(params->wdata, 0, params->wsize);
-        return;
-    }
-
     if (params->type == GGML_TASK_FINALIZE) {
-        if (nb01 >= nb00) {
         return;
     }
 
-        float * const wdata = params->wdata;
-
-        // cols per thread
-        const int dc = (ne + nth - 1)/nth;
-
-        // col range for this thread
-        const int ic0 = dc*ith;
-        const int ic1 = MIN(ic0 + dc, ne);
-
-        ggml_vec_cpy_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + ic0);
-
-        for (int k = 1; k < nth; k++) {
-            ggml_vec_acc_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + (ne + CACHE_LINE_SIZE_F32)*k + ic0);
-        }
-
-        return;
-    }
-
-    if (nb01 >= nb00) {
     // TODO: do not support transposed src1
 
     // parallelize by src0 rows using ggml_vec_dot_q4_0
@@ -6626,56 +6286,6 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
             ggml_vec_dot_q4_0(ne00, &dst_col[ic*ne0], src0_row, ((void *) (src1_col + (ic*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_0])/GGML_BLCK_SIZE[GGML_TYPE_Q4_0])));
         }
     }
-    } else {
-        //printf("AAAAA ith = %d, nth = %d\n", ith, nth);
-        // parallelize by src1 columns using ggml_vec_mad_q4_0
-        // each thread has its own work data
-        // during FINALIZE we accumulate all work data into dst
-
-        // total columns in src1
-        const int nc = ne10;
-
-        // columns per thread
-        const int dc = (nc + nth - 1)/nth;
-
-        // column range for this thread
-        const int ic0 = dc*ith;
-        const int ic1 = MIN(ic0 + dc, nc);
-
-        // work data for thread
-        const int wo = (ne + CACHE_LINE_SIZE_F32)*ith;
-        float * const wdata = params->wdata;
-
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                for (int i11 = 0; i11 < ne11; ++i11) {
-                    // dst indices
-                    const int i1 = i11;
-                    const int i2 = i12;
-                    const int i3 = i13;
-
-                    float * dst_row = wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0;
-
-                    for (int ic = ic0; ic < ic1; ++ic) {
-                        // src1 indices
-                        const int i10 = ic;
-
-                        // src0 indices
-                        const int i03 = i13;
-                        const int i02 = i12;
-                        const int i00 = ic;
-
-                        assert(sizeof(float)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize);
-
-                        void * src0_col =   (void *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03));
-                        float  src1_val = *(float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-
-                        ggml_vec_mad_q4_0(ne01, dst_row, src0_col, src1_val);
-                    }
-                }
-            }
-        }
-    }
 
     //int64_t t1 = ggml_time_us();
     //static int64_t acc = 0;
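During INIT above, src1 is quantized row by row into the work buffer, so the inner loop runs q4_0 x q4_0 dot products. The per-row stride in wdata follows from the block layout (a sketch; QK == 32 here):

```c
// Bytes occupied by one quantized src1 row in wdata (sketch).
// Each q4_0 block covers QK values: one fp32 scale plus QK/2 packed bytes.
const size_t block_size = sizeof(float) + QK/2;  // 20 bytes when QK == 32
const size_t row_size   = (ne10/QK)*block_size;  // equals the wdata increment
                                                 // ne10*GGML_TYPE_SIZE[GGML_TYPE_Q4_0]
                                                 //     /GGML_BLCK_SIZE[GGML_TYPE_Q4_0]
```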
@@ -6738,7 +6348,7 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
     GGML_ASSERT(ne3  == ne13);
 
     // TODO: we don't support permuted src0
-    GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_1] || nb01 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_1]);
+    GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_1]);
 
     // dst cannot be transposed or permuted
     GGML_ASSERT(nb0 == sizeof(float));
@@ -6753,9 +6363,6 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
 
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
-    //
-    // nb00 <  nb01 - src0 is transposed
-    //   compute by src0 columns
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
@@ -6780,9 +6387,6 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
                 {
                     size_t id = 0;
                     for (int i01 = 0; i01 < ne01; ++i01) {
-                        //for (int i00 = 0; i00 < ne00; ++i00) {
-                        //    wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
-                        //}
                         dequantize_row_q4_1((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
                         id += ne00;
                     }
@@ -6809,8 +6413,6 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
 #endif
 
     if (params->type == GGML_TASK_INIT) {
-        //printf("HHHHHHHHH ith = %d, nth = %d\n", ith, nth);
-        if (nb01 >= nb00) {
         char * wdata = params->wdata;
 
         for (int i13 = 0; i13 < ne13; ++i13) {
@@ -6828,35 +6430,10 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
         return;
     }
 
-        // TODO: fix this memset (wsize is overestimated)
-        memset(params->wdata, 0, params->wsize);
-        return;
-    }
-
     if (params->type == GGML_TASK_FINALIZE) {
-        if (nb01 >= nb00) {
         return;
     }
 
-        float * const wdata = params->wdata;
-
-        // cols per thread
-        const int dc = (ne + nth - 1)/nth;
-
-        // col range for this thread
-        const int ic0 = dc*ith;
-        const int ic1 = MIN(ic0 + dc, ne);
-
-        ggml_vec_cpy_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + ic0);
-
-        for (int k = 1; k < nth; k++) {
-            ggml_vec_acc_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + (ne + CACHE_LINE_SIZE_F32)*k + ic0);
-        }
-
-        return;
-    }
-
-    if (nb01 >= nb00) {
     // TODO: do not support transposed src1
 
     // parallelize by src0 rows using ggml_vec_dot_q4_1
@@ -6897,56 +6474,6 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
             ggml_vec_dot_q4_1(ne00, &dst_col[ic*ne0], src0_row, ((void *) (src1_col + (ic*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_1])/GGML_BLCK_SIZE[GGML_TYPE_Q4_1])));
         }
     }
-    } else {
-        //printf("AAAAA ith = %d, nth = %d\n", ith, nth);
-        // parallelize by src1 columns using ggml_vec_mad_q4_1
-        // each thread has its own work data
-        // during FINALIZE we accumulate all work data into dst
-
-        // total columns in src1
-        const int nc = ne10;
-
-        // columns per thread
-        const int dc = (nc + nth - 1)/nth;
-
-        // column range for this thread
-        const int ic0 = dc*ith;
-        const int ic1 = MIN(ic0 + dc, nc);
-
-        // work data for thread
-        const int wo = (ne + CACHE_LINE_SIZE_F32)*ith;
-        float * const wdata = params->wdata;
-
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                for (int i11 = 0; i11 < ne11; ++i11) {
-                    // dst indices
-                    const int i1 = i11;
-                    const int i2 = i12;
-                    const int i3 = i13;
-
-                    float * dst_row = wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0;
-
-                    for (int ic = ic0; ic < ic1; ++ic) {
-                        // src1 indices
-                        const int i10 = ic;
-
-                        // src0 indices
-                        const int i03 = i13;
-                        const int i02 = i12;
-                        const int i00 = ic;
-
-                        assert(sizeof(float)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize);
-
-                        void * src0_col =   (void *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03));
-                        float  src1_val = *(float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-
-                        ggml_vec_mad_q4_1(ne01, dst_row, src0_col, src1_val);
-                    }
-                }
-            }
-        }
-    }
 
     //int64_t t1 = ggml_time_us();
     //static int64_t acc = 0;
@@ -9588,11 +9115,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
                         size_t cur = 0;
 
-                        // TODO: better way to determine if the matrix is transposed
-                        if (node->src0->nb[1] < node->src0->nb[0]) {
-                            cur = ggml_nbytes(node)*node->n_tasks; // TODO: this can become (n_tasks-1)
-                                                                   // TODO: overestimated by factor of x2 for FP16
-                        } else {
                         if (node->src0->type == GGML_TYPE_F16 &&
                                 node->src1->type == GGML_TYPE_F32) {
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
@@ -9639,7 +9161,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                         } else {
                             GGML_ASSERT(false);
                         }
-                        }
 
                         work_size = MAX(work_size, cur);
                     } break;
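The removed branch reserved ggml_nbytes(node)*n_tasks of scratch for per-thread copies of dst; with it gone, mul_mat scratch only needs one converted copy of src1 (plus a dequantized src0 for the BLAS path). A sketch of the shape of the surviving size computation for the q4_0 case (the exact per-type expressions live in the switch above):

```c
// Scratch sketch: one q4_0-quantized copy of src1 for the whole node,
// independent of n_tasks.
cur = GGML_TYPE_SIZE[GGML_TYPE_Q4_0]*ggml_nelements(node->src1)
        /GGML_BLCK_SIZE[GGML_TYPE_Q4_0];
```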
Author: Georgi Gerganov