mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	opencl: improve profiling (#12442)
* opencl: more profiling timing * opencl: generate trace for profiling * opencl: reduce profiling overhead * Populate profiling timing info at the end rather than after each kernel run * opencl: fix for chrome tracing
This commit is contained in:
		| @@ -297,8 +297,27 @@ static int ggml_backend_opencl_n_devices = 0; | ||||
| struct ProfilingInfo { | ||||
|     std::string op_name; | ||||
|     std::string kernel_name; | ||||
|     // Kernel execution time in nanoseconds. | ||||
|     cl_ulong duration_ns; | ||||
|  | ||||
|     cl_kernel kernel; | ||||
|     cl_event evt; | ||||
|  | ||||
|     cl_ulong cmd_queued; | ||||
|     cl_ulong cmd_submit; | ||||
|     cl_ulong cmd_start; | ||||
|     cl_ulong cmd_end; | ||||
|     cl_ulong overhead_start; | ||||
|     cl_ulong overhead_end; | ||||
|     // For the times below, see spec for clGetEventProfilingInfo | ||||
|     // The time kernel spent in cmd queue - SUBMIT - QUEUED | ||||
|     cl_ulong cmd_queued_duration_ns; | ||||
|     // The time kernel spent for submission - START - SUBMIT | ||||
|     cl_ulong cmd_submit_duration_ns; | ||||
|     // Kernel execution time in nanoseconds - END - START | ||||
|     cl_ulong cmd_duration_ns; | ||||
|     // The time for the kernel to complete - COMPLETE - END | ||||
|     cl_ulong cmd_complete_duration_ns; | ||||
|     // Total time to finish the kernel - COMPELTE - QUEUED | ||||
|     cl_ulong cmd_total_duration_ns; | ||||
|     // Global and local work sizes. | ||||
|     size_t global_size[3]; | ||||
|     size_t local_size[3]; | ||||
| @@ -903,12 +922,56 @@ static void ggml_cl2_free(void) { | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     // Populate profiling info | ||||
|     for (ProfilingInfo & info : g_profiling_info) { | ||||
|         cl_ulong cmd_queued; | ||||
|         cl_ulong cmd_submit; | ||||
|         cl_ulong cmd_start; | ||||
|         cl_ulong cmd_end; | ||||
|         cl_ulong cmd_complete; | ||||
|  | ||||
|         CL_CHECK(clWaitForEvents(1, &info.evt)); | ||||
|         CL_CHECK(clGetEventProfilingInfo( | ||||
|             info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL)); | ||||
|         CL_CHECK(clGetEventProfilingInfo( | ||||
|             info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL)); | ||||
|         CL_CHECK(clGetEventProfilingInfo( | ||||
|             info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL)); | ||||
|         CL_CHECK(clGetEventProfilingInfo( | ||||
|             info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL)); | ||||
|         CL_CHECK(clGetEventProfilingInfo( | ||||
|             info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL)); | ||||
|         CL_CHECK(clReleaseEvent(info.evt)); | ||||
|  | ||||
|         char kernel_name[512]; | ||||
|         CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME, | ||||
|             sizeof(kernel_name), kernel_name, NULL)); | ||||
|         info.kernel_name = kernel_name; | ||||
|  | ||||
|         info.cmd_queued = cmd_queued; | ||||
|         info.cmd_submit = cmd_submit; | ||||
|         info.cmd_start  = cmd_start; | ||||
|         info.cmd_end    = cmd_end; | ||||
|  | ||||
|         info.cmd_queued_duration_ns     = cmd_submit    - cmd_queued; | ||||
|         info.cmd_submit_duration_ns     = cmd_start     - cmd_submit; | ||||
|         info.cmd_duration_ns            = cmd_end       - cmd_start; | ||||
|         info.cmd_complete_duration_ns   = cmd_complete  - cmd_end; | ||||
|         info.cmd_total_duration_ns      = cmd_complete  - cmd_queued; | ||||
|     } | ||||
|  | ||||
|     // Dump a csv | ||||
|     float total_kernel_time = 0; | ||||
|     fprintf(fperf, "op name, kernel name, duration (ms), global size, local size, output size\n"); | ||||
|     fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n"); | ||||
|     for (const ProfilingInfo & info : g_profiling_info) { | ||||
|         total_kernel_time += info.duration_ns/1.e6f; | ||||
|         fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n", | ||||
|             info.op_name.c_str(), info.kernel_name.c_str(), info.duration_ns/1.e6f, | ||||
|         total_kernel_time += info.cmd_duration_ns/1.e6f; | ||||
|         fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n", | ||||
|             info.op_name.c_str(), info.kernel_name.c_str(), | ||||
|             info.cmd_queued_duration_ns/1.e6f, | ||||
|             info.cmd_submit_duration_ns/1.e6f, | ||||
|             info.cmd_duration_ns/1.e6f, | ||||
|             info.cmd_complete_duration_ns/1.e6f, | ||||
|             info.cmd_total_duration_ns/1.e6f, | ||||
|             info.global_size[0], info.global_size[1], info.global_size[2], | ||||
|             info.local_size[0], info.local_size[2], info.local_size[2], | ||||
|             info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]); | ||||
| @@ -916,6 +979,27 @@ static void ggml_cl2_free(void) { | ||||
|     fclose(fperf); | ||||
|  | ||||
|     GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time); | ||||
|  | ||||
|     // Dump a simple chrome trace | ||||
|     FILE* ftrace = fopen("cl_trace.json", "w"); | ||||
|     if (!ftrace) { | ||||
|         GGML_LOG_ERROR("Failed to open cl_trace.json\n"); | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     fprintf(ftrace, "[\n"); | ||||
|     for (const ProfilingInfo & info : g_profiling_info) { | ||||
|         fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n", | ||||
|             info.kernel_name.c_str(), info.cmd_queued/1000); | ||||
|         fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n", | ||||
|             info.kernel_name.c_str(), info.cmd_submit/1000); | ||||
|  | ||||
|         fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n", | ||||
|             info.kernel_name.c_str(), info.cmd_start/1000); | ||||
|         fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n", | ||||
|             info.kernel_name.c_str(), info.cmd_end/1000); | ||||
|     } | ||||
|     fclose(ftrace); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| @@ -2062,25 +2146,14 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso | ||||
| // Profiling utility | ||||
| //------------------------------------------------------------------------------ | ||||
| #ifdef GGML_OPENCL_PROFILING | ||||
| void populateProfilingInfo( | ||||
| static void populateProfilingInfo( | ||||
|         ProfilingInfo& info, cl_event evt, cl_kernel kernel, | ||||
|         size_t global_size[3], size_t local_size[3], | ||||
|         const ggml_tensor * tensor) { | ||||
|     cl_ulong start; | ||||
|     cl_ulong end; | ||||
|     CL_CHECK(clWaitForEvents(1, &evt)); | ||||
|     CL_CHECK(clGetEventProfilingInfo( | ||||
|         evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL)); | ||||
|     CL_CHECK(clGetEventProfilingInfo( | ||||
|         evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); | ||||
|     info.op_name     = tensor->name; | ||||
|     info.kernel      = kernel; | ||||
|     info.evt         = evt; | ||||
|  | ||||
|     char kernel_name[512]; | ||||
|     CL_CHECK(clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, | ||||
|         sizeof(kernel_name), kernel_name, NULL)); | ||||
|  | ||||
|     info.duration_ns = end - start; | ||||
|     info.op_name = tensor->name; | ||||
|     info.kernel_name = kernel_name; | ||||
|     info.local_size[0]  = local_size[0]; | ||||
|     info.local_size[1]  = local_size[1]; | ||||
|     info.local_size[2]  = local_size[2]; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 lhez
					lhez