	examples : fix benchmark-matmult (#1554)
The precision for Q4_0 has degraded since #1508
@@ -33,7 +33,7 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
 }
 
 float tensor_sum_elements(const ggml_tensor * tensor) {
-    float sum = 0;
+    double sum = 0;
     if (tensor->type == GGML_TYPE_F32) {
         for (int j = 0; j < tensor->ne[1]; j++) {
             for (int k = 0; k < tensor->ne[0]; k++) {
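Widening the accumulator above is the core of the precision fix: with a float accumulator, the running sum over a large F32 result tensor stops updating once it crosses 2^24, corrupting the reference value the quantized result is compared against. A minimal standalone sketch (not part of the patch) of the effect:

    #include <cstdio>

    int main() {
        const long n = 1L << 26; // 67108864 additions
        float  sum_f32 = 0.0f;
        double sum_f64 = 0.0;
        for (long i = 0; i < n; i++) {
            sum_f32 += 1.0f;
            sum_f64 += 1.0;
        }
        // the float accumulator stalls at 2^24 = 16777216, because
        // 16777216.0f + 1.0f rounds back to 16777216.0f
        printf("f32: %.0f  f64: %.0f  expected: %ld\n", sum_f32, sum_f64, n);
        return 0;
    }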
@@ -126,12 +126,15 @@ int main(int argc, char ** argv)  {
 
     //printf("Memsize required = %i\n", sizex*sizex);
 
+    // TODO: perform the bench for all types or for a user specified type
+    const ggml_type qtype = GGML_TYPE_Q4_1;
+
     size_t ctx_size = 0;
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
     ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
     ctx_size += 1024*1024*16;
@@ -164,7 +167,7 @@ int main(int argc, char ** argv)  {
     struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
     ggml_set_f32(m2, 2.0f);
 
-    printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 1 - Matrix Mult via F32 code\n");
     // printf("Creating new tensor m11xm2\n");
     struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
 
@@ -182,17 +185,16 @@ int main(int argc, char ** argv)  {
 
     TENSOR_DUMP(gf.nodes[0]);
 
-    printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
 
     int32_t nelements = sizex*sizey;
-    int32_t ne[2] = { sizex, sizey };
 
     std::vector<int64_t> hist_cur(1 << 4, 0);
 
     // Set up a the benchmark matrices
     // printf("Creating new tensor q11 & Running quantize\n");
-    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
 
     // Set up a the compute graph
     // printf("Creating new tensor q31\n");
@@ -203,8 +205,8 @@ int main(int argc, char ** argv)  {
 
     // Set up a second graph computation to make sure we override the CPU cache lines
     // printf("Creating new tensor q12 & Running quantize\n");
-    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
 
     // printf("Creating new tensor q32\n");
     struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
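Replacing the hard-coded ggml_quantize_q4_0 calls with ggml_quantize_chunk in the two hunks above is what makes qtype a single switch: the chunk API dispatches on the type enum instead of baking in one quantization format, which also removes the need for the ne[] array, since the chunk call takes a flat element count plus a start offset. A hedged sketch of the call pattern, assuming the ggml.h signature of this era:

    #include <cstdint>
    #include <vector>
    #include "ggml.h"

    // Sketch only: quantize n float values into dst as the requested type.
    // hist receives a 16-bucket histogram of the emitted quantized values,
    // matching the hist_cur vector in the benchmark.
    static size_t quantize_buffer(ggml_type qtype, const float * src, void * dst, int n) {
        std::vector<int64_t> hist(1 << 4, 0);
        // start = 0, so the whole buffer is treated as one chunk
        return ggml_quantize_chunk(qtype, src, dst, 0, n, hist.data());
    }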
@@ -221,7 +223,7 @@ int main(int argc, char ** argv)  {
     printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
 
 
-    // Let's use the F32 result from above as a reference for the q4_0 multiplication
+    // Let's use the F32 result from above as a reference for the quantized multiplication
     float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
 
     printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
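For context on the gFLOPS line in the last hunk: each of the sizey*sizez output cells is a dot product of length sizex, and each step of that dot product costs one multiply plus one add. A small worked sketch of the bookkeeping, using what I believe are the benchmark's default dimensions (treat the concrete numbers as an assumption):

    #include <cstdio>
    #include <cstdint>

    int main() {
        const int64_t sizex = 11008, sizey = 4096, sizez = 128; // assumed defaults
        // one multiply + one add per element of each length-sizex dot product
        const int64_t flops_per_dot_product = 2 * sizex;
        const int64_t flops_per_matrix = flops_per_dot_product * sizey * sizez;
        printf("about %6.2f gFLOPS per graph evaluation\n",
               1.0 * flops_per_matrix / 1000 / 1000 / 1000); // ~11.54 for these sizes
        return 0;
    }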
Author: Georgi Gerganov