331 lines
8.6 KiB
C
331 lines
8.6 KiB
C
#ifndef RPROF_H
|
|
#define RPROF_H
|
|
|
|
// TODO: Maybe remove `assert()`, to lower overhead? Put them behind a macro?
|
|
|
|
// Available defines for configuration:
|
|
// RPROF_IMPLEMENTATION:
|
|
// Enable implementation of library
|
|
//
|
|
// RPROF_MAX_STACK (default 128):
|
|
// To record nested blocks rprof uses a stack, this defines the maximum size of that stack
|
|
//
|
|
// RPROF_MAX_SLOTS (default 32):
|
|
// When using `rprof_start()`, you need to specify into which slot the timing will be saved.
|
|
// This defines how many slots you have available.
|
|
//
|
|
// RPROF_ONLY_TOTAL_TIME:
|
|
// Don't time block marked between `rprof_start()` and `rprof_end()`.
|
|
// Useful for checking the overhead added by the profiler.
|
|
|
|
#include <inttypes.h>
|
|
#include <math.h>
|
|
#include <stdio.h>
|
|
#include <stdbool.h>
|
|
#include <assert.h>
|
|
#include <string.h>
|
|
#include <sys/param.h>
|
|
|
|
#ifndef RPROF_MAX_STACK
|
|
#define RPROF_MAX_STACK 128
|
|
#endif
|
|
|
|
#ifndef RPROF_MAX_SLOTS
|
|
#define RPROF_MAX_SLOTS 32
|
|
#endif
|
|
|
|
typedef struct {
|
|
char *label;
|
|
|
|
uint32_t calls;
|
|
uint64_t inclusive_duration;
|
|
uint64_t exclusive_duration;
|
|
uint64_t bytes_processed;
|
|
} rprof_slot;
|
|
|
|
typedef struct {
|
|
bool started;
|
|
bool finished;
|
|
|
|
uint64_t init_time;
|
|
uint64_t end_time;
|
|
|
|
uint32_t stack_size;
|
|
size_t slot_stack[RPROF_MAX_STACK];
|
|
uint64_t duration_stack[RPROF_MAX_STACK];
|
|
uint64_t timer_stack[RPROF_MAX_STACK];
|
|
|
|
rprof_slot slots[RPROF_MAX_SLOTS];
|
|
} rprof;
|
|
|
|
typedef int prof_sort_cmp_cb(const rprof_slot **A, const rprof_slot **B);
|
|
|
|
static rprof g_rprof = { 0 };
|
|
|
|
void rprof_init();
|
|
void rprof_end();
|
|
void rprof_start(size_t slot_idx, char *label, uint64_t bytes_processed);
|
|
void rprof_stop();
|
|
|
|
int rprof_cmp_by_calls(const rprof_slot **A, const rprof_slot **B);
|
|
int rprof_cmp_by_exclusive_duration(const rprof_slot **A, const rprof_slot **B);
|
|
int rprof_cmp_by_inclusive_duration(const rprof_slot **A, const rprof_slot **B);
|
|
void rprof_output(prof_sort_cmp_cb sort_cb);
|
|
|
|
#define RPROF_START(label) rprof_start(__COUNTER__, label, 0)
|
|
#define RPROF_START_BYTES(label, bytes) rprof_start(__COUNTER__, label, bytes)
|
|
#define RPROF_STOP() rprof_stop()
|
|
|
|
#define RPROF_IMPLEMENTATION
|
|
#ifdef RPROF_IMPLEMENTATION
|
|
|
|
// ------------------------ CPU Timing -------------------------
|
|
|
|
#ifdef WIN32
|
|
#include <intrin.h>
|
|
#include <windows.h>
|
|
|
|
static uint64_t rprof_get_os_timer_hz(void)
|
|
{
|
|
LARGE_INTEGER Freq;
|
|
QueryPerformanceFrequency(&Freq);
|
|
return Freq.QuadPart;
|
|
}
|
|
|
|
static uint64_t rprof_read_os_timer(void)
|
|
{
|
|
LARGE_INTEGER Value;
|
|
QueryPerformanceCounter(&Value);
|
|
return Value.QuadPart;
|
|
}
|
|
#else
|
|
#include <x86intrin.h>
|
|
#include <time.h>
|
|
|
|
static uint64_t rprof_get_os_timer_hz(void)
|
|
{
|
|
return 1000000000;
|
|
}
|
|
|
|
static uint64_t rprof_read_os_timer(void)
|
|
{
|
|
struct timespec time;
|
|
clock_gettime(CLOCK_MONOTONIC_RAW, &time);
|
|
return rprof_get_os_timer_hz()*time.tv_sec + time.tv_nsec;
|
|
}
|
|
#endif // WIN32
|
|
|
|
|
|
static uint64_t rprof_read_cpu_timer(void)
|
|
{
|
|
return __rdtsc();
|
|
}
|
|
|
|
static uint64_t rprof_get_cpu_timer_hz(uint64_t measure_time_ms)
|
|
{
|
|
uint64_t os_freq = rprof_get_os_timer_hz();
|
|
uint64_t os_start = rprof_read_os_timer();
|
|
uint64_t os_elapsed = 0;
|
|
|
|
uint64_t cpu_start = rprof_read_cpu_timer();
|
|
|
|
uint64_t wait_duration = os_freq * measure_time_ms / 1000;
|
|
while (os_elapsed < wait_duration) {
|
|
os_elapsed = rprof_read_os_timer() - os_start;
|
|
}
|
|
|
|
uint64_t cpu_elapsed = rprof_read_cpu_timer() - cpu_start;
|
|
if (os_elapsed) {
|
|
return os_freq * cpu_elapsed / os_elapsed;
|
|
} else {
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
// ------------------------ Profiling -------------------------
|
|
|
|
void rprof_init()
|
|
{
|
|
assert(!g_rprof.started);
|
|
g_rprof.init_time = rprof_read_cpu_timer();
|
|
g_rprof.started = true;
|
|
}
|
|
|
|
void rprof_end()
|
|
{
|
|
assert(!g_rprof.finished);
|
|
g_rprof.end_time = rprof_read_cpu_timer();
|
|
g_rprof.finished = true;
|
|
}
|
|
|
|
static int rprof_cmp_u32(uint32_t A, uint32_t B)
|
|
{
|
|
if (A == B) {
|
|
return 0;
|
|
} else if (A < B) {
|
|
return 1;
|
|
} else {
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
int rprof_cmp_by_calls(const rprof_slot **A, const rprof_slot **B)
|
|
{
|
|
return rprof_cmp_u32((*A)->calls, (*B)->calls);
|
|
}
|
|
|
|
int rprof_cmp_by_exclusive_duration(const rprof_slot **A, const rprof_slot **B)
|
|
{
|
|
return rprof_cmp_u32((*A)->exclusive_duration, (*B)->exclusive_duration);
|
|
}
|
|
|
|
int rprof_cmp_by_inclusive_duration(const rprof_slot **A, const rprof_slot **B)
|
|
{
|
|
return rprof_cmp_u32((*A)->inclusive_duration, (*B)->inclusive_duration);
|
|
}
|
|
|
|
#ifndef RPROF_ONLY_TOTAL_TIME
|
|
|
|
#ifndef ARRAY_LEN
|
|
#define ARRAY_LEN(x) (sizeof(x)/sizeof(x[0]))
|
|
#endif
|
|
|
|
void rprof_start(size_t slot_idx, char *label, uint64_t bytes_processed)
|
|
{
|
|
assert(g_rprof.started);
|
|
assert(!g_rprof.finished);
|
|
|
|
assert(slot_idx < RPROF_MAX_SLOTS);
|
|
assert(g_rprof.stack_size < RPROF_MAX_STACK-1);
|
|
|
|
rprof_slot *slot = &g_rprof.slots[slot_idx];
|
|
slot->label = label;
|
|
slot->calls++;
|
|
slot->bytes_processed += bytes_processed;
|
|
|
|
g_rprof.duration_stack[g_rprof.stack_size] = slot->inclusive_duration;
|
|
g_rprof.slot_stack[g_rprof.stack_size] = slot_idx;
|
|
g_rprof.timer_stack[g_rprof.stack_size] = rprof_read_cpu_timer();
|
|
g_rprof.stack_size++;
|
|
}
|
|
|
|
void rprof_stop()
|
|
{
|
|
uint64_t now = rprof_read_cpu_timer();
|
|
g_rprof.stack_size--;
|
|
uint64_t start = g_rprof.timer_stack[g_rprof.stack_size];
|
|
size_t slot_idx = g_rprof.slot_stack[g_rprof.stack_size];
|
|
size_t inclusive_duration = g_rprof.duration_stack[g_rprof.stack_size];
|
|
|
|
uint64_t duration = (now - start);
|
|
|
|
if (g_rprof.stack_size > 0) {
|
|
size_t parent_slot = g_rprof.slot_stack[g_rprof.stack_size-1];
|
|
g_rprof.slots[parent_slot].exclusive_duration -= duration;
|
|
}
|
|
|
|
g_rprof.slots[slot_idx].exclusive_duration += duration;
|
|
g_rprof.slots[slot_idx].inclusive_duration = inclusive_duration + duration;
|
|
}
|
|
|
|
void rprof_output(prof_sort_cmp_cb sort_cb)
|
|
{
|
|
assert(g_rprof.finished);
|
|
|
|
uint64_t total_time = g_rprof.end_time - g_rprof.init_time;
|
|
uint64_t cpu_hz = rprof_get_cpu_timer_hz(100);
|
|
float total_time_secs = (float)total_time / cpu_hz;
|
|
|
|
rprof_slot *slots[RPROF_MAX_SLOTS+1] = { 0 };
|
|
uint32_t slot_count = 0;
|
|
uint64_t profiled_duration = 0;
|
|
uint32_t label_width = 0;
|
|
|
|
for (int i = 0; i < RPROF_MAX_SLOTS; i++) {
|
|
rprof_slot *slot = &g_rprof.slots[i];
|
|
if (slot->label) {
|
|
slots[slot_count] = slot;
|
|
slot_count++;
|
|
label_width = MAX(label_width, strlen(slot->label));
|
|
profiled_duration += slot->exclusive_duration;
|
|
}
|
|
}
|
|
|
|
uint64_t other_duration = total_time - profiled_duration;
|
|
rprof_slot other_slot = {
|
|
.label = "<other>",
|
|
.calls = 1,
|
|
.inclusive_duration = other_duration,
|
|
.exclusive_duration = other_duration
|
|
};
|
|
|
|
slots[slot_count++] = &other_slot;
|
|
|
|
if (sort_cb) {
|
|
qsort(slots, slot_count, sizeof(rprof_slot*), (void*)sort_cb);
|
|
}
|
|
|
|
printf("\nTotal time taken: %.3fms (%lu) (CPU: ~%.3fGHz)\n", total_time_secs*1000, total_time, (float)cpu_hz/1000000000);
|
|
|
|
uint32_t duration_max_width = 0;
|
|
uint32_t percent_max_width = 0;
|
|
char percent_column[RPROF_MAX_SLOTS+1][128];
|
|
for (int i = 0; i < slot_count; i++) {
|
|
rprof_slot *slot = slots[i];
|
|
|
|
float percent = (float)slot->inclusive_duration*100/total_time;
|
|
float exclusive_percent = (float)slot->exclusive_duration*100/total_time;
|
|
uint32_t length;
|
|
if (slot->inclusive_duration == slot->exclusive_duration) {
|
|
length = snprintf(percent_column[i], 128, "(%6.3f%%)", exclusive_percent);
|
|
} else {
|
|
length = snprintf(percent_column[i], 128, "(%6.3f%%, %6.3f%% w/children)", exclusive_percent, percent);
|
|
}
|
|
percent_max_width = MAX(percent_max_width, length);
|
|
duration_max_width = MAX(duration_max_width, (int)log10(slot->inclusive_duration) + 1);
|
|
}
|
|
|
|
char line_format[128];
|
|
snprintf(line_format, ARRAY_LEN(line_format), " %%%ds - %%%dlu %%-%ds [%%d]", label_width, duration_max_width, percent_max_width);
|
|
|
|
for (int i = 0; i < slot_count; i++) {
|
|
rprof_slot *slot = slots[i];
|
|
|
|
printf(line_format, slot->label, slot->inclusive_duration, percent_column[i], slot->calls);
|
|
if (slot->bytes_processed > 0) {
|
|
float time_spent = (float)slot->inclusive_duration / cpu_hz;
|
|
float megabytes = (float)slot->bytes_processed / (1024 * 1024);
|
|
printf(" %.3fmb", megabytes);
|
|
if (megabytes > 10) {
|
|
printf(" at %.3fgb/s", (megabytes / 1024) / time_spent);
|
|
} else {
|
|
printf(" at %.3fmb/s", megabytes / time_spent);
|
|
}
|
|
}
|
|
printf("\n");
|
|
}
|
|
}
|
|
|
|
static_assert(__COUNTER__ < RPROF_MAX_SLOTS, "__COUNTER__ reached max profiler slots");
|
|
|
|
#else
|
|
|
|
#define rprof_start(...)
|
|
#define rprof_stop(...)
|
|
|
|
void rprof_output(prof_sort_cmp_cb sort_cb)
|
|
{
|
|
assert(g_rprof.finished);
|
|
|
|
uint64_t total_time = g_rprof.end_time - g_rprof.init_time;
|
|
uint64_t cpu_hz = rprof_get_cpu_timer_hz(100);
|
|
|
|
printf("\nTotal time taken: %.3fms (%lu)\n", (float)total_time*1000/cpu_hz, total_time);
|
|
}
|
|
|
|
#endif // RPROF_ONLY_TOTAL_TIME
|
|
|
|
#endif // RPROF_IMPLEMENTATION
|
|
|
|
#endif //RPROF_H
|