#ifndef RPROF_H #define RPROF_H // TODO: Maybe remove `assert()`, to lower overhead? Put them behind a macro? // Available defines for configuration: // RPROF_IMPLEMENTATION: // Enable implementation of library // // RPROF_MAX_STACK (default 128): // To record nested blocks rprof uses a stack, this defines the maximum size of that stack // // RPROF_MAX_SLOTS (default 32): // When using `rprof_start()`, you need to specify into which slot the timing will be saved. // This defines how many slots you have available. // // RPROF_ONLY_TOTAL_TIME: // Don't time block marked between `rprof_start()` and `rprof_end()`. // Useful for checking the overhead added by the profiler. #include #include #include #include #include #include #include #ifndef RPROF_MAX_STACK #define RPROF_MAX_STACK 128 #endif #ifndef RPROF_MAX_SLOTS #define RPROF_MAX_SLOTS 32 #endif typedef struct { char *label; uint32_t calls; uint64_t inclusive_duration; uint64_t exclusive_duration; uint64_t bytes_processed; } rprof_slot; typedef struct { bool started; bool finished; uint64_t init_time; uint64_t end_time; uint32_t stack_size; size_t slot_stack[RPROF_MAX_STACK]; uint64_t duration_stack[RPROF_MAX_STACK]; uint64_t timer_stack[RPROF_MAX_STACK]; rprof_slot slots[RPROF_MAX_SLOTS]; } rprof; typedef int prof_sort_cmp_cb(const rprof_slot **A, const rprof_slot **B); static rprof g_rprof = { 0 }; void rprof_init(); void rprof_end(); void rprof_start(size_t slot_idx, char *label, uint64_t bytes_processed); void rprof_stop(); int rprof_cmp_by_calls(const rprof_slot **A, const rprof_slot **B); int rprof_cmp_by_exclusive_duration(const rprof_slot **A, const rprof_slot **B); int rprof_cmp_by_inclusive_duration(const rprof_slot **A, const rprof_slot **B); void rprof_output(prof_sort_cmp_cb sort_cb); #define RPROF_START(label) rprof_start(__COUNTER__, label, 0) #define RPROF_START_BYTES(label, bytes) rprof_start(__COUNTER__, label, bytes) #define RPROF_STOP() rprof_stop() #define RPROF_IMPLEMENTATION #ifdef RPROF_IMPLEMENTATION // ------------------------ CPU Timing ------------------------- #ifdef WIN32 #include #include static uint64_t rprof_get_os_timer_hz(void) { LARGE_INTEGER Freq; QueryPerformanceFrequency(&Freq); return Freq.QuadPart; } static uint64_t rprof_read_os_timer(void) { LARGE_INTEGER Value; QueryPerformanceCounter(&Value); return Value.QuadPart; } #else #include #include static uint64_t rprof_get_os_timer_hz(void) { return 1000000000; } static uint64_t rprof_read_os_timer(void) { struct timespec time; clock_gettime(CLOCK_MONOTONIC_RAW, &time); return rprof_get_os_timer_hz()*time.tv_sec + time.tv_nsec; } #endif // WIN32 static uint64_t rprof_read_cpu_timer(void) { return __rdtsc(); } static uint64_t rprof_get_cpu_timer_hz(uint64_t measure_time_ms) { uint64_t os_freq = rprof_get_os_timer_hz(); uint64_t os_start = rprof_read_os_timer(); uint64_t os_elapsed = 0; uint64_t cpu_start = rprof_read_cpu_timer(); uint64_t wait_duration = os_freq * measure_time_ms / 1000; while (os_elapsed < wait_duration) { os_elapsed = rprof_read_os_timer() - os_start; } uint64_t cpu_elapsed = rprof_read_cpu_timer() - cpu_start; if (os_elapsed) { return os_freq * cpu_elapsed / os_elapsed; } else { return 0; } } // ------------------------ Profiling ------------------------- void rprof_init() { assert(!g_rprof.started); g_rprof.init_time = rprof_read_cpu_timer(); g_rprof.started = true; } void rprof_end() { assert(!g_rprof.finished); g_rprof.end_time = rprof_read_cpu_timer(); g_rprof.finished = true; } static int rprof_cmp_u32(uint32_t A, uint32_t B) { if (A == B) { return 0; } else if (A < B) { return 1; } else { return -1; } } int rprof_cmp_by_calls(const rprof_slot **A, const rprof_slot **B) { return rprof_cmp_u32((*A)->calls, (*B)->calls); } int rprof_cmp_by_exclusive_duration(const rprof_slot **A, const rprof_slot **B) { return rprof_cmp_u32((*A)->exclusive_duration, (*B)->exclusive_duration); } int rprof_cmp_by_inclusive_duration(const rprof_slot **A, const rprof_slot **B) { return rprof_cmp_u32((*A)->inclusive_duration, (*B)->inclusive_duration); } #ifndef RPROF_ONLY_TOTAL_TIME #ifndef ARRAY_LEN #define ARRAY_LEN(x) (sizeof(x)/sizeof(x[0])) #endif void rprof_start(size_t slot_idx, char *label, uint64_t bytes_processed) { assert(g_rprof.started); assert(!g_rprof.finished); assert(slot_idx < RPROF_MAX_SLOTS); assert(g_rprof.stack_size < RPROF_MAX_STACK-1); rprof_slot *slot = &g_rprof.slots[slot_idx]; slot->label = label; slot->calls++; slot->bytes_processed += bytes_processed; g_rprof.duration_stack[g_rprof.stack_size] = slot->inclusive_duration; g_rprof.slot_stack[g_rprof.stack_size] = slot_idx; g_rprof.timer_stack[g_rprof.stack_size] = rprof_read_cpu_timer(); g_rprof.stack_size++; } void rprof_stop() { uint64_t now = rprof_read_cpu_timer(); g_rprof.stack_size--; uint64_t start = g_rprof.timer_stack[g_rprof.stack_size]; size_t slot_idx = g_rprof.slot_stack[g_rprof.stack_size]; size_t inclusive_duration = g_rprof.duration_stack[g_rprof.stack_size]; uint64_t duration = (now - start); if (g_rprof.stack_size > 0) { size_t parent_slot = g_rprof.slot_stack[g_rprof.stack_size-1]; g_rprof.slots[parent_slot].exclusive_duration -= duration; } g_rprof.slots[slot_idx].exclusive_duration += duration; g_rprof.slots[slot_idx].inclusive_duration = inclusive_duration + duration; } void rprof_output(prof_sort_cmp_cb sort_cb) { assert(g_rprof.finished); uint64_t total_time = g_rprof.end_time - g_rprof.init_time; uint64_t cpu_hz = rprof_get_cpu_timer_hz(100); float total_time_secs = (float)total_time / cpu_hz; rprof_slot *slots[RPROF_MAX_SLOTS+1] = { 0 }; uint32_t slot_count = 0; uint64_t profiled_duration = 0; uint32_t label_width = 0; for (int i = 0; i < RPROF_MAX_SLOTS; i++) { rprof_slot *slot = &g_rprof.slots[i]; if (slot->label) { slots[slot_count] = slot; slot_count++; label_width = MAX(label_width, strlen(slot->label)); profiled_duration += slot->exclusive_duration; } } uint64_t other_duration = total_time - profiled_duration; rprof_slot other_slot = { .label = "", .calls = 1, .inclusive_duration = other_duration, .exclusive_duration = other_duration }; slots[slot_count++] = &other_slot; if (sort_cb) { qsort(slots, slot_count, sizeof(rprof_slot*), (void*)sort_cb); } printf("\nTotal time taken: %.3fms (%lu) (CPU: ~%.3fGHz)\n", total_time_secs*1000, total_time, (float)cpu_hz/1000000000); uint32_t duration_max_width = 0; uint32_t percent_max_width = 0; char percent_column[RPROF_MAX_SLOTS+1][128]; for (int i = 0; i < slot_count; i++) { rprof_slot *slot = slots[i]; float percent = (float)slot->inclusive_duration*100/total_time; float exclusive_percent = (float)slot->exclusive_duration*100/total_time; uint32_t length; if (slot->inclusive_duration == slot->exclusive_duration) { length = snprintf(percent_column[i], 128, "(%6.3f%%)", exclusive_percent); } else { length = snprintf(percent_column[i], 128, "(%6.3f%%, %6.3f%% w/children)", exclusive_percent, percent); } percent_max_width = MAX(percent_max_width, length); duration_max_width = MAX(duration_max_width, (int)log10(slot->inclusive_duration) + 1); } char line_format[128]; snprintf(line_format, ARRAY_LEN(line_format), " %%%ds - %%%dlu %%-%ds [%%d]", label_width, duration_max_width, percent_max_width); for (int i = 0; i < slot_count; i++) { rprof_slot *slot = slots[i]; printf(line_format, slot->label, slot->inclusive_duration, percent_column[i], slot->calls); if (slot->bytes_processed > 0) { float time_spent = (float)slot->inclusive_duration / cpu_hz; float megabytes = (float)slot->bytes_processed / (1024 * 1024); printf(" %.3fmb", megabytes); if (megabytes > 10) { printf(" at %.3fgb/s", (megabytes / 1024) / time_spent); } else { printf(" at %.3fmb/s", megabytes / time_spent); } } printf("\n"); } } static_assert(__COUNTER__ < RPROF_MAX_SLOTS, "__COUNTER__ reached max profiler slots"); #else #define rprof_start(...) #define rprof_stop(...) void rprof_output(prof_sort_cmp_cb sort_cb) { assert(g_rprof.finished); uint64_t total_time = g_rprof.end_time - g_rprof.init_time; uint64_t cpu_hz = rprof_get_cpu_timer_hz(100); printf("\nTotal time taken: %.3fms (%lu)\n", (float)total_time*1000/cpu_hz, total_time); } #endif // RPROF_ONLY_TOTAL_TIME #endif // RPROF_IMPLEMENTATION #endif //RPROF_H