mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
	ggml : change ggml_graph_compute() API to not require context (#1999)
* ggml_graph_compute: deprecate using ggml_context, try to resolve issue #287
* rewrite: no longer consider backward compatibility; plan and make_plan
* minor: rename ctx as plan; const
* remove ggml_graph_compute from tests/test-grad0.c, but current change breaks backward
* add static ggml_graph_compute_sugar()
* minor: update comments
* reusable buffers
* ggml : more consistent naming + metal fixes
* ggml : fix docs
* tests : disable grad / opt + minor naming changes
* ggml : add ggml_graph_compute_with_ctx()
  - backwards compatible API
  - deduplicates a lot of copy-paste
* ci : enable test-grad0
* examples : factor out plan allocation into a helper function
* llama : factor out plan stuff into a helper function
* ci : fix env
* llama : fix duplicate symbols + refactor example benchmark
* ggml : remove obsolete assert + refactor n_tasks section
* ggml : fix indentation in switch
* llama : avoid unnecessary bool
* ggml : remove comments from source file and match order in header

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
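For readers skimming the diff below: the core of the change is that a graph is no longer computed directly from a ggml_context. The caller first asks ggml_graph_plan() how much scratch memory the computation needs for a given thread count, provides that buffer, and then calls ggml_graph_compute() with the plan. A minimal sketch of the new call sequence, using only the API that appears in this diff (the tensor name, function name, and thread count are placeholders):

    #include <vector>
    #include "ggml.h"

    // Sketch of the plan-based compute flow introduced by this commit.
    // `result` is a tensor built in some ggml_context; n_threads is chosen by the caller.
    void compute_graph(struct ggml_tensor * result, int n_threads) {
        struct ggml_cgraph gf = ggml_build_forward(result);

        // 1) ask ggml how much work memory this graph needs for n_threads
        struct ggml_cplan plan = ggml_graph_plan(&gf, n_threads);

        // 2) the caller owns the work buffer (and can reuse it across computations)
        std::vector<uint8_t> work(plan.work_size);
        if (plan.work_size > 0) {
            plan.work_data = work.data();
        }

        // 3) run the graph with the plan; no ggml_context is needed here
        ggml_graph_compute(&gf, &plan);
    }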
@@ -20,6 +20,17 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
 float tensor_sum_elements(const ggml_tensor * tensor) {
     float sum = 0;
     if (tensor->type==GGML_TYPE_F32) {
@@ -159,13 +170,14 @@ int main(int argc, char ** argv)  {
     // printf("Creating compute graph\n");
     struct ggml_cgraph gf = ggml_build_forward(m11xm2);
 
-    gf.n_threads=benchmark_params.n_threads;
-    printf("cgraph->n_threads=%i\n",gf.n_threads);
+    printf("n_threads=%i\n", benchmark_params.n_threads);
 
     TENSOR_DUMP(m11);
     TENSOR_DUMP(m2);
 
-    ggml_graph_compute(ctx, &gf);
+    std::vector<uint8_t> work_buffer;
+
+    ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads);
 
     TENSOR_DUMP(gf.nodes[0]);
 
@@ -187,7 +199,6 @@ int main(int argc, char ** argv)  {
 
     // printf("Creating compute graph\n");
     struct ggml_cgraph gf31 = ggml_build_forward(q31);
-    gf31.n_threads=benchmark_params.n_threads;
 
     // Set up a second graph computation to make sure we override the CPU cache lines
     // printf("Creating new tensor q12 & Running quantize\n");
@@ -199,8 +210,7 @@ int main(int argc, char ** argv)  {
 
     //printf("Creating compute graph\n");
     struct ggml_cgraph gf32 = ggml_build_forward(q32);
-    gf32.n_threads=benchmark_params.n_threads;
-    printf("cgraph->n_threads=%i\n",gf31.n_threads);
+    printf("n_threads=%i\n", benchmark_params.n_threads);
 
     const int dimx = sizex;
     const int dimy = sizey;
@@ -221,14 +231,15 @@ int main(int argc, char ** argv)  {
 
         long long int start = ggml_time_us();
         //printf("Running ggml_graph_compute\n");
-        ggml_graph_compute(ctx, &gf31);
+        ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads);
+
         long long int stop = ggml_time_us();
         long long int usec = stop-start;
         double gflops = (double)(flops_per_matrix)/usec/1000.0;
         gflops_sum += gflops;
         printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
             i,
-            gf31.n_threads,
+            benchmark_params.n_threads,
             sizex, sizey, sizez, flops_per_matrix,
             usec,gflops);
 
@@ -253,7 +264,7 @@ int main(int argc, char ** argv)  {
         }
 
         // Running a different graph computation to make sure we override the CPU cache lines
-        ggml_graph_compute(ctx, &gf32);
+        ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads);
     }
     printf("\n");
     printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
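The commit message also mentions a backwards-compatible entry point, ggml_graph_compute_with_ctx(), for callers that still hold a ggml_context and do not want to manage the work buffer themselves. It is not shown in this diff, so the sketch below assumes its signature is ggml_graph_compute_with_ctx(ctx, &graph, n_threads) and that it allocates the plan's work buffer inside the given context:

    #include "ggml.h"

    // Hedged sketch of the backwards-compatible path (signature assumed, not shown in this diff).
    // The old pattern was:
    //     gf.n_threads = n_threads;
    //     ggml_graph_compute(ctx, &gf);
    // The replacement passes the thread count explicitly instead of storing it on the graph:
    void compute_graph_with_ctx(struct ggml_context * ctx, struct ggml_tensor * result, int n_threads) {
        struct ggml_cgraph gf = ggml_build_forward(result);
        ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
    }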
Qingyou Meng