mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	profiler: initial support for profiling graph ops
This commit is contained in:
		| @@ -120,6 +120,7 @@ option(GGML_CCACHE "ggml: use ccache if available"                   ON) | ||||
| option(GGML_ALL_WARNINGS           "ggml: enable all compiler warnings"                   ON) | ||||
| option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF) | ||||
| option(GGML_GPROF                  "ggml: enable gprof"                                   OFF) | ||||
| option(GGML_GRAPH_PROFILER         "ggml: enable internal Graph and Op profiler"          OFF) | ||||
|  | ||||
| # build | ||||
| option(GGML_FATAL_WARNINGS    "ggml: enable -Werror flag"    OFF) | ||||
|   | ||||
| @@ -8,6 +8,10 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux") | ||||
|     add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>) | ||||
| endif() | ||||
|  | ||||
| if (GGML_GRAPH_PROFILER) | ||||
|     add_compile_definitions(GGML_GRAPH_PROFILER) | ||||
| endif() | ||||
|  | ||||
| if (NOT MSVC) | ||||
|     if (GGML_SANITIZE_THREAD) | ||||
|         add_compile_options(-fsanitize=thread) | ||||
| @@ -209,6 +213,8 @@ add_library(ggml-base | ||||
|             ggml-threading.h | ||||
|             ggml-quants.c | ||||
|             ggml-quants.h | ||||
|             ggml-profile.h | ||||
|             ggml-profile.cpp | ||||
|             gguf.cpp) | ||||
|  | ||||
| target_include_directories(ggml-base PRIVATE .) | ||||
|   | ||||
| @@ -13,6 +13,7 @@ | ||||
| #include "binary-ops.h" | ||||
| #include "vec.h" | ||||
| #include "ops.h" | ||||
| #include "ggml-profile.h" | ||||
| #include "ggml.h" | ||||
|  | ||||
| #if defined(_MSC_VER) || defined(__MINGW32__) | ||||
| @@ -2889,6 +2890,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { | ||||
|     for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) { | ||||
|         struct ggml_tensor * node = cgraph->nodes[node_n]; | ||||
|  | ||||
|         ggml_graph_profile_event(cgraph, GGML_PROF_OP_START, node_n, state->ith); | ||||
|  | ||||
|         ggml_compute_forward(&params, node); | ||||
|  | ||||
|         if (state->ith == 0 && cplan->abort_callback && | ||||
| @@ -2897,9 +2900,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { | ||||
|             tp->ec    = GGML_STATUS_ABORTED; | ||||
|         } | ||||
|  | ||||
|         ggml_graph_profile_event(cgraph, GGML_PROF_OP_SYNC, node_n, state->ith); | ||||
|  | ||||
|         if (node_n + 1 < cgraph->n_nodes) { | ||||
|             ggml_barrier(state->threadpool); | ||||
|         } | ||||
|  | ||||
|         ggml_graph_profile_event(cgraph, GGML_PROF_OP_END,  node_n, state->ith); | ||||
|     } | ||||
|  | ||||
|     ggml_barrier(state->threadpool); | ||||
| @@ -3142,6 +3149,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl | ||||
|     int n_threads                               = cplan->n_threads; | ||||
|     struct ggml_threadpool * threadpool = cplan->threadpool; | ||||
|  | ||||
|     ggml_graph_profile_start(cgraph, n_threads); | ||||
|  | ||||
|     bool disposable_threadpool = false; | ||||
|  | ||||
|     if (threadpool == NULL) { | ||||
| @@ -3200,6 +3209,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl | ||||
|     // don't leave affinity set on the main thread | ||||
|     clear_numa_thread_affinity(); | ||||
|  | ||||
|     ggml_graph_profile_finish(cgraph, n_threads); | ||||
|  | ||||
|     enum ggml_status ret = threadpool->ec; | ||||
|  | ||||
|     if (disposable_threadpool) { | ||||
|   | ||||
| @@ -324,6 +324,8 @@ enum ggml_cgraph_eval_order { | ||||
|     GGML_CGRAPH_EVAL_ORDER_COUNT | ||||
| }; | ||||
|  | ||||
| struct ggml_profile_data; | ||||
|  | ||||
| struct ggml_cgraph { | ||||
|     int size;    // maximum number of nodes/leafs/grads/grad_accs | ||||
|     int n_nodes; // number of nodes currently in use | ||||
| @@ -335,6 +337,8 @@ struct ggml_cgraph { | ||||
|     struct ggml_tensor ** leafs;     // tensors with constant data | ||||
|     int32_t             * use_counts;// number of uses of each tensor, indexed by hash table slot | ||||
|  | ||||
|     struct ggml_profile_data * prof; | ||||
|  | ||||
|     struct ggml_hash_set visited_hash_set; | ||||
|  | ||||
|     enum ggml_cgraph_eval_order order; | ||||
|   | ||||
							
								
								
									
										177
									
								
								ggml/src/ggml-profile.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										177
									
								
								ggml/src/ggml-profile.cpp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,177 @@ | ||||
| #include "ggml-profile.h" | ||||
|  | ||||
| #include <stdint.h> | ||||
| #include <stdlib.h> | ||||
| #include <stdio.h> | ||||
| #include <string.h> | ||||
|  | ||||
| #include <string> | ||||
| #include <chrono> | ||||
|  | ||||
| #ifdef GGML_GRAPH_PROFILER | ||||
|  | ||||
| struct ggml_profile_output { /* where and how the profile tables are written */ | ||||
|     const char * prefix; /* string printed at the start of every output line */ | ||||
|     FILE *       stream; /* destination stream: stderr or the file opened by ggml_graph_profile_init */ | ||||
| }; | ||||
|  | ||||
| // Allocate the profiling state for a graph and attach it to cg->prof. | ||||
| // | ||||
| // Profiling is opt-in through the GGML_GRAPH_PROFILE environment variable: | ||||
| //   unset            - nothing is allocated and cg->prof is left untouched | ||||
| //   "stderr" or "1"  - tables go to stderr, every line prefixed with "ggml-profile:" | ||||
| //   anything else    - the value is used as a file path that is opened for writing | ||||
| // | ||||
| // If that file can't be opened the output falls back to stderr, so out->stream is never NULL | ||||
| // when it is later passed to fprintf / fclose. | ||||
| // | ||||
| // Everything lives in one heap block laid out as: | ||||
| //   ggml_profile_data | ggml_profile_output | timing pointer table [n_nodes] | timing slots | ||||
| // which is why releasing cg->prof releases all of it. | ||||
| // | ||||
| // n_threads is only used in the allocation failure message. | ||||
| extern "C" void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads) | ||||
| { | ||||
|     // TODO: make this a param | ||||
|     const char *env = getenv("GGML_GRAPH_PROFILE"); | ||||
|     if (!env) { return; } | ||||
|  | ||||
|     // The number of threads may change between passes (pp vs tg). | ||||
|     // Allocate for max_n_threads for simplicity for now. | ||||
|     // TODO: use aligned allocator | ||||
|  | ||||
|     size_t node_size = sizeof(struct ggml_profile_timing) * GGML_MAX_N_THREADS; | ||||
|     size_t pvec_size = sizeof(struct ggml_profile_timing *) * cg->n_nodes; // table of per node pointers | ||||
|     size_t time_size = node_size * cg->n_nodes; | ||||
|     size_t t_size    = pvec_size + time_size + sizeof(ggml_profile_output) + sizeof(ggml_profile_data); | ||||
|  | ||||
|     // calloc hands back zeroed memory, all counters start at 0 | ||||
|     uint8_t * ptr = (uint8_t *) calloc(1, t_size); | ||||
|     if (!ptr) { | ||||
|         fprintf(stderr, "ggml-profile: failed to allocate profiling data : n_threads %d n_nodes %d\n", n_threads, cg->n_nodes); | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     // init all pointers | ||||
|     cg->prof         = (ggml_profile_data *)    ptr; ptr += sizeof(ggml_profile_data); | ||||
|     cg->prof->output = (ggml_profile_output *)  ptr; ptr += sizeof(ggml_profile_output); | ||||
|     cg->prof->timing = (ggml_profile_timing **) ptr; ptr += pvec_size; | ||||
|     for (int i=0; i < cg->n_nodes; i++) { | ||||
|         cg->prof->timing[i] = (struct ggml_profile_timing *) ptr; ptr += node_size; | ||||
|     } | ||||
|  | ||||
|     // init the output | ||||
|     ggml_profile_output *out = cg->prof->output; | ||||
|     if (!strcmp("stderr", env) || !strcmp("1", env)) { | ||||
|         out->prefix = "ggml-profile:"; | ||||
|         out->stream = stderr; | ||||
|     } else { | ||||
|         out->prefix = ""; | ||||
|         out->stream = fopen(env, "w"); | ||||
|         if (!out->stream) { | ||||
|             // keep profiling usable instead of leaving a NULL stream behind | ||||
|             fprintf(stderr, "ggml-profile: failed to open '%s' for writing, using stderr instead\n", env); | ||||
|             out->prefix = "ggml-profile:"; | ||||
|             out->stream = stderr; | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| // Called at the beginning of a graph compute: sets up the profiling state on first use. | ||||
| // Nothing else happens here, a graph that already has cg->prof is left as is. | ||||
| extern "C" void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads) | ||||
| { | ||||
|     if (cg->prof) { | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     ggml_graph_profile_init(cg, n_threads); | ||||
| } | ||||
|  | ||||
| // Write the four dimensions of t into str as "ne0:ne1:ne2:ne3". | ||||
| // Returns the number of characters written (sprintf result, terminator not counted). | ||||
| // Each dimension is narrowed to int for printing; str is not bounds-checked. | ||||
| static inline int ggml_profile_format_tensor_dims(char *str, struct ggml_tensor *t) | ||||
| { | ||||
|     return sprintf(str, "%d:%d:%d:%d", | ||||
|         (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]); | ||||
| } | ||||
|  | ||||
| // Build the "op dims" column for node t: "<src0> x <src1> ... -> <node>". | ||||
| // The source part and the arrow are only present when t has a src[0]; the list of | ||||
| // sources ends at the first empty slot. str is not bounds-checked. | ||||
| static inline void ggml_profile_format_op_dims(char *str, struct ggml_tensor *t) | ||||
| { | ||||
|     char *cur = str; | ||||
|  | ||||
|     if (t->src[0]) { | ||||
|         for (int s = 0; s < GGML_MAX_SRC && t->src[s]; s++) { | ||||
|             if (s > 0) { | ||||
|                 cur += sprintf(cur, " x "); | ||||
|             } | ||||
|             cur += ggml_profile_format_tensor_dims(cur, t->src[s]); | ||||
|         } | ||||
|  | ||||
|         cur += sprintf(cur, " -> "); | ||||
|     } | ||||
|  | ||||
|     // the node's own dims go through a scratch buffer so they can be padded to 12 chars | ||||
|     char own[64]; | ||||
|     ggml_profile_format_tensor_dims(own, t); | ||||
|  | ||||
|     sprintf(cur, "%12s", own); | ||||
| } | ||||
|  | ||||
| // Build the "op types" column for node t: "<src0 type> x <src1 type> ... -> <node type>". | ||||
| // Same layout rules as the dims column: the source part needs a src[0] and stops at | ||||
| // the first empty slot. The node type is padded to at least 3 chars. | ||||
| static inline void ggml_profile_format_op_types(char *str, struct ggml_tensor *t) | ||||
| { | ||||
|     char *cur = str; | ||||
|  | ||||
|     if (t->src[0]) { | ||||
|         for (int s = 0; s < GGML_MAX_SRC && t->src[s]; s++) { | ||||
|             if (s > 0) { | ||||
|                 cur += sprintf(cur, " x "); | ||||
|             } | ||||
|             cur += sprintf(cur, "%s", ggml_type_name(t->src[s]->type)); | ||||
|         } | ||||
|  | ||||
|         cur += sprintf(cur, " -> "); | ||||
|     } | ||||
|  | ||||
|     sprintf(cur, "%3s", ggml_type_name(t->type)); | ||||
| } | ||||
|  | ||||
| // Dump the timings collected for the last graph compute and reset them. | ||||
| // | ||||
| // One table row is printed per node. For every node the per thread event times | ||||
| // are folded into three sums over the first n_threads threads: | ||||
| //   proc  = SYNC - START | ||||
| //   sync  = END  - SYNC | ||||
| //   total = END  - START | ||||
| // After a thread's slot has been added it is zeroed for the next run. | ||||
| // | ||||
| // Does nothing when profiling is not active for this graph or there is no stream to write to. | ||||
| extern "C" void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads) | ||||
| { | ||||
|     if (!cg->prof) { return; } | ||||
|  | ||||
|     ggml_profile_output *out = cg->prof->output; | ||||
|     if (!out->stream) { return; } // fprintf on a NULL stream is undefined | ||||
|  | ||||
|     fprintf(out->stream, "%s| node idx | op name | proc (nsec) | sync (nsec) | total (nsec) | op dims | op types | tensor name |\n", out->prefix); | ||||
|     fprintf(out->stream, "%s| -------: | :------ | ----------: | ----------: | -----------: | ------: | -------: | ----------: |\n", out->prefix); | ||||
|  | ||||
|     char dims[64 * GGML_MAX_SRC]; | ||||
|     char types[16 * GGML_MAX_SRC]; | ||||
|  | ||||
|     for (int i = 0; i < cg->n_nodes; i++) { | ||||
|         uint64_t p_nsec = 0; | ||||
|         uint64_t s_nsec = 0; | ||||
|         uint64_t t_nsec = 0; | ||||
|  | ||||
|         // add up per thread counters and reset them | ||||
|         for (int t=0; t < n_threads; t++) { | ||||
|             ggml_profile_timing &timing = cg->prof->timing[i][t]; | ||||
|  | ||||
|             p_nsec += timing.nsec[GGML_PROF_OP_SYNC] - timing.nsec[GGML_PROF_OP_START]; | ||||
|             s_nsec += timing.nsec[GGML_PROF_OP_END]  - timing.nsec[GGML_PROF_OP_SYNC]; | ||||
|             t_nsec += timing.nsec[GGML_PROF_OP_END]  - timing.nsec[GGML_PROF_OP_START]; | ||||
|  | ||||
|             timing.nsec[GGML_PROF_OP_START] = 0; | ||||
|             timing.nsec[GGML_PROF_OP_SYNC]  = 0; | ||||
|             timing.nsec[GGML_PROF_OP_END]   = 0; | ||||
|         } | ||||
|  | ||||
|         ggml_profile_format_op_dims(dims, cg->nodes[i]); | ||||
|         ggml_profile_format_op_types(types, cg->nodes[i]); | ||||
|  | ||||
|         // unsigned long is only 32 bits wide on LLP64 targets, print the sums as unsigned long long | ||||
|         fprintf(out->stream, "%s| %04d | %10s | %10llu | %10llu | %10llu | %46s | %22s | %20s |\n", out->prefix, | ||||
|             i, ggml_op_name(cg->nodes[i]->op), | ||||
|             (unsigned long long) p_nsec, (unsigned long long) s_nsec, (unsigned long long) t_nsec, | ||||
|             dims, types, cg->nodes[i]->name); | ||||
|     } | ||||
|     fprintf(out->stream, "%s   \n", out->prefix); // empty line to split tables | ||||
|  | ||||
|     // push the table out now, a file stream is otherwise only written when it gets closed | ||||
|     fflush(out->stream); | ||||
| } | ||||
|  | ||||
| // Release the profiling state of a graph. | ||||
| // Closes the output stream unless it is stderr (or was never opened), frees the | ||||
| // block behind cg->prof and clears the pointer. Safe to call when profiling is off. | ||||
| extern "C" void ggml_graph_profile_free(struct ggml_cgraph *cg) | ||||
| { | ||||
|     if (!cg->prof) { return; } | ||||
|  | ||||
|     ggml_profile_output *out = cg->prof->output; | ||||
|     // stream is NULL when the profile file could not be opened, fclose(NULL) is undefined | ||||
|     if (out->stream && out->stream != stderr) { | ||||
|         fclose(out->stream); | ||||
|     } | ||||
|     out->stream = nullptr; | ||||
|  | ||||
|     free(cg->prof); cg->prof = nullptr; | ||||
| } | ||||
|  | ||||
| // Record the current time for event e of node node_n on thread ith. | ||||
| // No-op when the graph has no profiling state. | ||||
| // | ||||
| // The stored values are only ever subtracted from each other when the table is | ||||
| // printed, so a monotonic clock is used: steady_clock never jumps backwards, | ||||
| // which high_resolution_clock does not guarantee. | ||||
| extern "C" void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith) | ||||
| { | ||||
|     if (!cg->prof) { return; } | ||||
|  | ||||
|     using clock = std::chrono::steady_clock; | ||||
|  | ||||
|     const auto now = clock::now().time_since_epoch(); | ||||
|  | ||||
|     ggml_profile_timing &timing = cg->prof->timing[node_n][ith]; | ||||
|     timing.nsec[e] = (uint64_t) std::chrono::duration_cast<std::chrono::nanoseconds>(now).count(); | ||||
| } | ||||
|  | ||||
| #endif // GGML_GRAPH_PROFILER | ||||
							
								
								
									
										90
									
								
								ggml/src/ggml-profile.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										90
									
								
								ggml/src/ggml-profile.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,90 @@ | ||||
| #pragma once | ||||
|  | ||||
| #include "ggml-impl.h" | ||||
|  | ||||
| // GGML internal header | ||||
|  | ||||
| #ifdef __cplusplus | ||||
| extern "C" { | ||||
| #endif | ||||
|  | ||||
| // op profile events & timing (per op / per thread) | ||||
| enum ggml_profile_event { /* values double as indices into ggml_profile_timing.nsec */ | ||||
|     GGML_PROF_OP_START, /* taken right before the op is computed */ | ||||
|     GGML_PROF_OP_SYNC, /* taken after the op is computed, before the threads synchronize */ | ||||
|     GGML_PROF_OP_END /* taken after the synchronization point; must stay last, it sizes the nsec array */ | ||||
| }; | ||||
|  | ||||
| struct ggml_profile_timing { /* timestamps of one op on one thread */ | ||||
|     uint64_t nsec[GGML_PROF_OP_END + 1]; // event times in nsec, one slot per enum ggml_profile_event value | ||||
| }; | ||||
|  | ||||
| struct ggml_profile_output; | ||||
|  | ||||
| struct ggml_profile_data { /* profiling state attached to a graph through ggml_cgraph.prof */ | ||||
|     struct ggml_profile_output *output; /* output settings, opaque outside of the profiler implementation */ | ||||
|     struct ggml_profile_timing ** timing; // per op / per thread timing, indexed as timing[node][thread] | ||||
| }; | ||||
|  | ||||
| // returns true when the graph carries profiling state (cg->prof is set) | ||||
| static inline bool ggml_graph_profile_enabled(const struct ggml_cgraph *cg) | ||||
| { | ||||
|     const struct ggml_profile_data * state = cg->prof; | ||||
|  | ||||
|     return state != NULL; | ||||
| } | ||||
|  | ||||
| // look up the timing slot of a given node / thread pair, NULL if the graph is not profiled | ||||
| // backends can use it to store timing data they collected internally | ||||
| // node_n and ith are used as is, without range checks | ||||
| static inline struct ggml_profile_timing * ggml_graph_profile_timing(const struct ggml_cgraph *cg, int node_n, int ith) | ||||
| { | ||||
|     return cg->prof ? &cg->prof->timing[node_n][ith] : NULL; | ||||
| } | ||||
|  | ||||
| #ifndef GGML_GRAPH_PROFILER | ||||
|  | ||||
| // Stub out all profiler functions | ||||
|  | ||||
| // no-op stand-in used when GGML_GRAPH_PROFILER is not defined | ||||
| static inline void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads) { | ||||
|     GGML_UNUSED(n_threads); | ||||
|     GGML_UNUSED(cg); | ||||
| } | ||||
|  | ||||
| // no-op stand-in used when GGML_GRAPH_PROFILER is not defined | ||||
| static inline void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads) { | ||||
|     GGML_UNUSED(n_threads); | ||||
|     GGML_UNUSED(cg); | ||||
| } | ||||
|  | ||||
| // no-op stand-in used when GGML_GRAPH_PROFILER is not defined | ||||
| static inline void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads) { | ||||
|     GGML_UNUSED(n_threads); | ||||
|     GGML_UNUSED(cg); | ||||
| } | ||||
|  | ||||
| // no-op stand-in used when GGML_GRAPH_PROFILER is not defined | ||||
| static inline void ggml_graph_profile_free(struct ggml_cgraph *cg) { | ||||
|     GGML_UNUSED(cg); | ||||
| } | ||||
|  | ||||
| // no-op stand-in used when GGML_GRAPH_PROFILER is not defined | ||||
| static inline void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith) { | ||||
|     GGML_UNUSED(ith); | ||||
|     GGML_UNUSED(node_n); | ||||
|     GGML_UNUSED(e); | ||||
|     GGML_UNUSED(cg); | ||||
| } | ||||
|  | ||||
| #else | ||||
|  | ||||
| void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads); /* allocate profiling state for cg if GGML_GRAPH_PROFILE is set in the environment */ | ||||
| void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads); /* call before computing the graph; runs the init on first use */ | ||||
| void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads); /* call after computing the graph; prints the per node table and resets the counters */ | ||||
| void ggml_graph_profile_free(struct ggml_cgraph *cg); /* close the output and release the profiling state */ | ||||
| void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith); /* timestamp event e for node node_n on thread ith */ | ||||
|  | ||||
| #endif // GGML_GRAPH_PROFILER | ||||
|  | ||||
| #ifdef __cplusplus | ||||
| } | ||||
| #endif | ||||
| @@ -9,6 +9,7 @@ | ||||
|  | ||||
| // FIXME: required here for quantization functions | ||||
| #include "ggml-quants.h" | ||||
| #include "ggml-profile.h" | ||||
|  | ||||
| #ifdef GGML_USE_CPU_HBM | ||||
| #include <hbwmalloc.h> | ||||
| @@ -6736,6 +6737,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz | ||||
|         /*.grad_accs    =*/ grad_accs_ptr, | ||||
|         /*.leafs        =*/ leafs_ptr, | ||||
|         /*.use_counts   =*/ use_counts_ptr, | ||||
|         /*.prof         =*/ NULL, | ||||
|         /*.hash_table   =*/ { hash_size, hash_used, hash_keys_ptr }, | ||||
|         /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, | ||||
|     }; | ||||
| @@ -6763,6 +6765,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) | ||||
|         /*.grad_accs        =*/ NULL, | ||||
|         /*.leafs            =*/ NULL, | ||||
|         /*.use_counts       =*/ cgraph0->use_counts, | ||||
|         /*.prof             =*/ NULL, | ||||
|         /*.visited_hash_set =*/ cgraph0->visited_hash_set, | ||||
|         /*.order            =*/ cgraph0->order, | ||||
|     }; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Max Krasnyansky
					Max Krasnyansky