From ca52f12fda854b99624deec2e8b6c88f51ef9c03 Mon Sep 17 00:00:00 2001 From: Rokas Puzonas Date: Thu, 20 Jul 2023 16:02:13 +0300 Subject: [PATCH] add profiler --- src/rprof.h | 311 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 311 insertions(+) create mode 100644 src/rprof.h diff --git a/src/rprof.h b/src/rprof.h new file mode 100644 index 0000000..5dc1712 --- /dev/null +++ b/src/rprof.h @@ -0,0 +1,311 @@ +#ifndef RPROF_H +#define RPROF_H + +// TODO: Maybe remove `assert()`, to lower overhead? Put them behind a macro? + +// Available defines for configuration: +// RPROF_IMPLEMENTATION: +// Enable implementation of library +// +// RPROF_MAX_STACK (default 128): +// To record nested blocks rprof uses a stack, this defines the maximum size of that stack +// +// RPROF_MAX_SLOTS (default 32): +// When using `rprof_start()`, you need to specify into which slot the timing will be saved. +// This defines how many slots you have available. +// +// RPROF_ONLY_TOTAL_TIME: +// Don't time block marked between `rprof_start()` and `rprof_end()`. +// Useful for checking the overhead added by the profiler. + +#include +#include +#include +#include +#include +#include +#include + +#ifndef RPROF_MAX_STACK + #define RPROF_MAX_STACK 128 +#endif + +#ifndef RPROF_MAX_SLOTS + #define RPROF_MAX_SLOTS 32 +#endif + +typedef struct { + const char *label; + + uint32_t calls; + uint64_t inclusive_duration; + uint64_t exclusive_duration; +} rprof_slot; + +typedef struct { + bool started; + bool finished; + + uint64_t init_time; + uint64_t end_time; + + uint32_t stack_size; + size_t slot_stack[RPROF_MAX_STACK]; + uint64_t duration_stack[RPROF_MAX_STACK]; + uint64_t timer_stack[RPROF_MAX_STACK]; + + rprof_slot slots[RPROF_MAX_SLOTS]; +} rprof; + +typedef int prof_sort_cmp_cb(const rprof_slot **A, const rprof_slot **B); +typedef int qsort_cmp(const void*,const void*); + +static rprof g_rprof = { 0 }; + +void rprof_init(); +void rprof_end(); +void rprof_start(size_t slot_idx, const char *label); +void rprof_stop(); + +int rprof_cmp_by_calls(const rprof_slot **A, const rprof_slot **B); +int rprof_cmp_by_exclusive_duration(const rprof_slot **A, const rprof_slot **B); +int rprof_cmp_by_inclusive_duration(const rprof_slot **A, const rprof_slot **B); +void rprof_output(prof_sort_cmp_cb sort_cb); + +#define RPROF_START(label) rprof_start(__COUNTER__, label) +#define RPROF_STOP() rprof_stop() + +#ifdef RPROF_IMPLEMENTATION + +// ------------------------ CPU Timing ------------------------- + +#ifdef WIN32 + #include + #include + + static uint64_t rprof_get_os_timer_hz(void) + { + LARGE_INTEGER Freq; + QueryPerformanceFrequency(&Freq); + return Freq.QuadPart; + } + + static uint64_t rprof_read_os_timer(void) + { + LARGE_INTEGER Value; + QueryPerformanceCounter(&Value); + return Value.QuadPart; + } +#else + #include + #include + + static uint64_t rprof_get_os_timer_hz(void) + { + return 1000000000; + } + + static uint64_t rprof_read_os_timer(void) + { + struct timespec time; + clock_gettime(CLOCK_MONOTONIC_RAW, &time); + return rprof_get_os_timer_hz()*time.tv_sec + time.tv_nsec; + } +#endif // WIN32 + + +static uint64_t rprof_read_cpu_timer(void) +{ + return __rdtsc(); +} + +static uint64_t rprof_get_cpu_timer_hz(uint64_t measure_time_ms) +{ + uint64_t os_freq = rprof_get_os_timer_hz(); + uint64_t os_start = rprof_read_os_timer(); + uint64_t os_elapsed = 0; + + uint64_t cpu_start = rprof_read_cpu_timer(); + + uint64_t wait_duration = os_freq * measure_time_ms / 1000; + while (os_elapsed < wait_duration) { + os_elapsed = rprof_read_os_timer() - os_start; + } + + uint64_t cpu_elapsed = rprof_read_cpu_timer() - cpu_start; + if (os_elapsed) { + return os_freq * cpu_elapsed / os_elapsed; + } else { + return 0; + } +} + +// ------------------------ Profiling ------------------------- + +void rprof_init() +{ + assert(!g_rprof.started); + g_rprof.init_time = rprof_read_cpu_timer(); + g_rprof.started = true; +} + +void rprof_end() +{ + assert(!g_rprof.finished); + g_rprof.end_time = rprof_read_cpu_timer(); + g_rprof.finished = true; + g_rprof.started = false; +} + +static int rprof_cmp_u32(uint32_t A, uint32_t B) +{ + if (A == B) { + return 0; + } else if (A < B) { + return 1; + } else { + return -1; + } +} + +int rprof_cmp_by_calls(const rprof_slot **A, const rprof_slot **B) +{ + return rprof_cmp_u32((*A)->calls, (*B)->calls); +} + +int rprof_cmp_by_exclusive_duration(const rprof_slot **A, const rprof_slot **B) +{ + return rprof_cmp_u32((*A)->exclusive_duration, (*B)->exclusive_duration); +} + +int rprof_cmp_by_inclusive_duration(const rprof_slot **A, const rprof_slot **B) +{ + return rprof_cmp_u32((*A)->inclusive_duration, (*B)->inclusive_duration); +} + +#ifndef RPROF_ONLY_TOTAL_TIME + + #define RPROF_ARRAY_LEN(x) (sizeof(x)/sizeof(x[0])) + + void rprof_start(size_t slot_idx, const char *label) + { + assert(slot_idx < RPROF_MAX_SLOTS); + assert(g_rprof.stack_size < RPROF_MAX_STACK-1); + + rprof_slot *slot = &g_rprof.slots[slot_idx]; + slot->label = label; + slot->calls++; + + g_rprof.duration_stack[g_rprof.stack_size] = slot->inclusive_duration; + g_rprof.slot_stack[g_rprof.stack_size] = slot_idx; + g_rprof.timer_stack[g_rprof.stack_size] = rprof_read_cpu_timer(); + g_rprof.stack_size++; + } + + void rprof_stop() + { + uint64_t now = rprof_read_cpu_timer(); + g_rprof.stack_size--; + uint64_t start = g_rprof.timer_stack[g_rprof.stack_size]; + size_t slot_idx = g_rprof.slot_stack[g_rprof.stack_size]; + size_t inclusive_duration = g_rprof.duration_stack[g_rprof.stack_size]; + + uint64_t duration = (now - start); + + if (g_rprof.stack_size > 0) { + size_t parent_slot = g_rprof.slot_stack[g_rprof.stack_size-1]; + g_rprof.slots[parent_slot].exclusive_duration -= duration; + } + + g_rprof.slots[slot_idx].exclusive_duration += duration; + g_rprof.slots[slot_idx].inclusive_duration = inclusive_duration + duration; + } + + void rprof_output(prof_sort_cmp_cb sort_cb) + { + assert(g_rprof.finished); + + uint64_t total_time = g_rprof.end_time - g_rprof.init_time; + uint64_t cpu_hz = rprof_get_cpu_timer_hz(100); + + rprof_slot *slots[RPROF_MAX_SLOTS+1] = { 0 }; + uint32_t slot_count = 0; + uint64_t profiled_duration = 0; + uint32_t label_width = 0; + + for (int i = 0; i < RPROF_MAX_SLOTS; i++) { + rprof_slot *slot = &g_rprof.slots[i]; + if (slot->label) { + slots[slot_count] = slot; + slot_count++; + label_width = MAX(label_width, strlen(slot->label)); + profiled_duration += slot->exclusive_duration; + } + } + + uint64_t other_duration = total_time - profiled_duration; + rprof_slot other_slot = { + .label = "", + .calls = 1, + .inclusive_duration = other_duration, + .exclusive_duration = other_duration + }; + + slots[slot_count++] = &other_slot; + + if (sort_cb) { + qsort(slots, slot_count, sizeof(rprof_slot*), (qsort_cmp*)sort_cb); + } + + printf("\nTotal time taken: %.3fms (%lu)\n", (float)total_time*1000/cpu_hz, total_time); + + uint32_t duration_max_width = 0; + uint32_t percent_max_width = 0; + char percent_column[RPROF_MAX_SLOTS+1][128]; + for (int i = 0; i < slot_count; i++) { + rprof_slot *slot = slots[i]; + + float percent = (float)slot->inclusive_duration*100/total_time; + float exclusive_percent = (float)slot->exclusive_duration*100/total_time; + uint32_t length; + if (slot->inclusive_duration == slot->exclusive_duration) { + length = snprintf(percent_column[i], 128, "(%6.3f%%)", exclusive_percent); + } else { + length = snprintf(percent_column[i], 128, "(%6.3f%%, %6.3f%% w/children)", exclusive_percent, percent); + } + percent_max_width = MAX(percent_max_width, length); + duration_max_width = MAX(duration_max_width, (int)log10(slot->inclusive_duration) + 1); + } + + char line_format[128]; + snprintf(line_format, RPROF_ARRAY_LEN(line_format), " %%%ds - %%%dlu %%-%ds [%%d]\n", label_width, duration_max_width, percent_max_width); + + for (int i = 0; i < slot_count; i++) { + rprof_slot *slot = slots[i]; + + printf(line_format, slot->label, slot->inclusive_duration, percent_column[i], slot->calls); + } + } + + static_assert(__COUNTER__ < RPROF_MAX_SLOTS, "__COUNTER__ reached max profiler slots"); + +#else + + #define rprof_start(...) + #define rprof_stop(...) + + void rprof_output(prof_sort_cmp_cb sort_cb) + { + assert(g_rprof.finished); + + uint64_t total_time = g_rprof.end_time - g_rprof.init_time; + uint64_t cpu_hz = rprof_get_cpu_timer_hz(100); + + printf("\nTotal time taken: %.3fms (%lu)\n", (float)total_time*1000/cpu_hz, total_time); + } + +#endif // RPROF_ONLY_TOTAL_TIME + +#endif // RPROF_IMPLEMENTATION + +#endif //RPROF_H