mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-29 08:41:22 +00:00 
			
		
		
		
	opencl: improve profiling (#12442)
* opencl: more profiling timing
* opencl: generate trace for profiling
* opencl: reduce profiling overhead
* Populate profiling timing info at the end rather than after each kernel run
* opencl: fix for chrome tracing
This commit is contained in:
		| @@ -297,8 +297,27 @@ static int ggml_backend_opencl_n_devices = 0; | |||||||
| struct ProfilingInfo { | struct ProfilingInfo { | ||||||
|     std::string op_name; |     std::string op_name; | ||||||
|     std::string kernel_name; |     std::string kernel_name; | ||||||
|     // Kernel execution time in nanoseconds. |  | ||||||
|     cl_ulong duration_ns; |     cl_kernel kernel; | ||||||
|  |     cl_event evt; | ||||||
|  |  | ||||||
|  |     cl_ulong cmd_queued; | ||||||
|  |     cl_ulong cmd_submit; | ||||||
|  |     cl_ulong cmd_start; | ||||||
|  |     cl_ulong cmd_end; | ||||||
|  |     cl_ulong overhead_start; | ||||||
|  |     cl_ulong overhead_end; | ||||||
|  |     // For the times below, see spec for clGetEventProfilingInfo | ||||||
|  |     // The time kernel spent in cmd queue - SUBMIT - QUEUED | ||||||
|  |     cl_ulong cmd_queued_duration_ns; | ||||||
|  |     // The time kernel spent for submission - START - SUBMIT | ||||||
|  |     cl_ulong cmd_submit_duration_ns; | ||||||
|  |     // Kernel execution time in nanoseconds - END - START | ||||||
|  |     cl_ulong cmd_duration_ns; | ||||||
|  |     // The time for the kernel to complete - COMPLETE - END | ||||||
|  |     cl_ulong cmd_complete_duration_ns; | ||||||
|  |     // Total time to finish the kernel - COMPLETE - QUEUED | ||||||
|  |     cl_ulong cmd_total_duration_ns; | ||||||
|     // Global and local work sizes. |     // Global and local work sizes. | ||||||
|     size_t global_size[3]; |     size_t global_size[3]; | ||||||
|     size_t local_size[3]; |     size_t local_size[3]; | ||||||
| @@ -903,12 +922,56 @@ static void ggml_cl2_free(void) { | |||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     // Populate profiling info | ||||||
|  |     for (ProfilingInfo & info : g_profiling_info) { | ||||||
|  |         cl_ulong cmd_queued; | ||||||
|  |         cl_ulong cmd_submit; | ||||||
|  |         cl_ulong cmd_start; | ||||||
|  |         cl_ulong cmd_end; | ||||||
|  |         cl_ulong cmd_complete; | ||||||
|  |  | ||||||
|  |         CL_CHECK(clWaitForEvents(1, &info.evt)); | ||||||
|  |         CL_CHECK(clGetEventProfilingInfo( | ||||||
|  |             info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL)); | ||||||
|  |         CL_CHECK(clGetEventProfilingInfo( | ||||||
|  |             info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL)); | ||||||
|  |         CL_CHECK(clGetEventProfilingInfo( | ||||||
|  |             info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL)); | ||||||
|  |         CL_CHECK(clGetEventProfilingInfo( | ||||||
|  |             info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL)); | ||||||
|  |         CL_CHECK(clGetEventProfilingInfo( | ||||||
|  |             info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL)); | ||||||
|  |         CL_CHECK(clReleaseEvent(info.evt)); | ||||||
|  |  | ||||||
|  |         char kernel_name[512]; | ||||||
|  |         CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME, | ||||||
|  |             sizeof(kernel_name), kernel_name, NULL)); | ||||||
|  |         info.kernel_name = kernel_name; | ||||||
|  |  | ||||||
|  |         info.cmd_queued = cmd_queued; | ||||||
|  |         info.cmd_submit = cmd_submit; | ||||||
|  |         info.cmd_start  = cmd_start; | ||||||
|  |         info.cmd_end    = cmd_end; | ||||||
|  |  | ||||||
|  |         info.cmd_queued_duration_ns     = cmd_submit    - cmd_queued; | ||||||
|  |         info.cmd_submit_duration_ns     = cmd_start     - cmd_submit; | ||||||
|  |         info.cmd_duration_ns            = cmd_end       - cmd_start; | ||||||
|  |         info.cmd_complete_duration_ns   = cmd_complete  - cmd_end; | ||||||
|  |         info.cmd_total_duration_ns      = cmd_complete  - cmd_queued; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // Dump a csv | ||||||
|     float total_kernel_time = 0; |     float total_kernel_time = 0; | ||||||
|     fprintf(fperf, "op name, kernel name, duration (ms), global size, local size, output size\n"); |     fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n"); | ||||||
|     for (const ProfilingInfo & info : g_profiling_info) { |     for (const ProfilingInfo & info : g_profiling_info) { | ||||||
|         total_kernel_time += info.duration_ns/1.e6f; |         total_kernel_time += info.cmd_duration_ns/1.e6f; | ||||||
|         fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n", |         fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n", | ||||||
|             info.op_name.c_str(), info.kernel_name.c_str(), info.duration_ns/1.e6f, |             info.op_name.c_str(), info.kernel_name.c_str(), | ||||||
|  |             info.cmd_queued_duration_ns/1.e6f, | ||||||
|  |             info.cmd_submit_duration_ns/1.e6f, | ||||||
|  |             info.cmd_duration_ns/1.e6f, | ||||||
|  |             info.cmd_complete_duration_ns/1.e6f, | ||||||
|  |             info.cmd_total_duration_ns/1.e6f, | ||||||
|             info.global_size[0], info.global_size[1], info.global_size[2], |             info.global_size[0], info.global_size[1], info.global_size[2], | ||||||
|             info.local_size[0], info.local_size[2], info.local_size[2], |             info.local_size[0], info.local_size[2], info.local_size[2], | ||||||
|             info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]); |             info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]); | ||||||
| @@ -916,6 +979,27 @@ static void ggml_cl2_free(void) { | |||||||
|     fclose(fperf); |     fclose(fperf); | ||||||
|  |  | ||||||
|     GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time); |     GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time); | ||||||
|  |  | ||||||
|  |     // Dump a simple chrome trace | ||||||
|  |     FILE* ftrace = fopen("cl_trace.json", "w"); | ||||||
|  |     if (!ftrace) { | ||||||
|  |         GGML_LOG_ERROR("Failed to open cl_trace.json\n"); | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fprintf(ftrace, "[\n"); | ||||||
|  |     for (const ProfilingInfo & info : g_profiling_info) { | ||||||
|  |         fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n", | ||||||
|  |             info.kernel_name.c_str(), info.cmd_queued/1000); | ||||||
|  |         fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n", | ||||||
|  |             info.kernel_name.c_str(), info.cmd_submit/1000); | ||||||
|  |  | ||||||
|  |         fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n", | ||||||
|  |             info.kernel_name.c_str(), info.cmd_start/1000); | ||||||
|  |         fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n", | ||||||
|  |             info.kernel_name.c_str(), info.cmd_end/1000); | ||||||
|  |     } | ||||||
|  |     fclose(ftrace); | ||||||
| #endif | #endif | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -2062,25 +2146,14 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso | |||||||
| // Profiling utility | // Profiling utility | ||||||
| //------------------------------------------------------------------------------ | //------------------------------------------------------------------------------ | ||||||
| #ifdef GGML_OPENCL_PROFILING | #ifdef GGML_OPENCL_PROFILING | ||||||
| void populateProfilingInfo( | static void populateProfilingInfo( | ||||||
|         ProfilingInfo& info, cl_event evt, cl_kernel kernel, |         ProfilingInfo& info, cl_event evt, cl_kernel kernel, | ||||||
|         size_t global_size[3], size_t local_size[3], |         size_t global_size[3], size_t local_size[3], | ||||||
|         const ggml_tensor * tensor) { |         const ggml_tensor * tensor) { | ||||||
|     cl_ulong start; |     info.op_name     = tensor->name; | ||||||
|     cl_ulong end; |     info.kernel      = kernel; | ||||||
|     CL_CHECK(clWaitForEvents(1, &evt)); |     info.evt         = evt; | ||||||
|     CL_CHECK(clGetEventProfilingInfo( |  | ||||||
|         evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL)); |  | ||||||
|     CL_CHECK(clGetEventProfilingInfo( |  | ||||||
|         evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); |  | ||||||
|  |  | ||||||
|     char kernel_name[512]; |  | ||||||
|     CL_CHECK(clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, |  | ||||||
|         sizeof(kernel_name), kernel_name, NULL)); |  | ||||||
|  |  | ||||||
|     info.duration_ns = end - start; |  | ||||||
|     info.op_name = tensor->name; |  | ||||||
|     info.kernel_name = kernel_name; |  | ||||||
|     info.local_size[0]  = local_size[0]; |     info.local_size[0]  = local_size[0]; | ||||||
|     info.local_size[1]  = local_size[1]; |     info.local_size[1]  = local_size[1]; | ||||||
|     info.local_size[2]  = local_size[2]; |     info.local_size[2]  = local_size[2]; | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 lhez
					lhez