diff --git a/src/main.c b/src/main.c index 98dbb31..d9c6a09 100644 --- a/src/main.c +++ b/src/main.c @@ -197,7 +197,7 @@ int main(int argc, char **argv) printf("Diff of reference harvensine sum: %.16f\n", harvensine_sum - reference_harvensine_sum); } - prof_output(); + prof_output(prof_cmp_by_exclusive_duration); return 0; } diff --git a/src/profiler.c b/src/profiler.c index 7ff31d9..d582016 100644 --- a/src/profiler.c +++ b/src/profiler.c @@ -15,10 +15,10 @@ typedef struct { char *label; - u32 calls; - u64 duration; - u64 children_duration; + u32 calls; + u64 inclusive_duration; + u64 exclusive_duration; } prof_slot; typedef struct { @@ -29,12 +29,15 @@ typedef struct { u64 end_time; size_t slot_stack[PROFILER_MAX_STACK]; + u64 duration_stack[PROFILER_MAX_STACK]; u64 timer_stack[PROFILER_MAX_STACK]; u32 stack_size; prof_slot slots[PROFILER_MAX_SLOTS]; } prof; +typedef int prof_sort_cmp_cb(const prof_slot **A, const prof_slot **B); + static prof g_prof = { 0 }; void prof_init() @@ -63,8 +66,9 @@ void prof_start(size_t slot_idx, char *label) slot->label = label; slot->calls++; - g_prof.slot_stack[g_prof.stack_size] = slot_idx; - g_prof.timer_stack[g_prof.stack_size] = read_cpu_timer(); + g_prof.duration_stack[g_prof.stack_size] = slot->inclusive_duration; + g_prof.slot_stack[g_prof.stack_size] = slot_idx; + g_prof.timer_stack[g_prof.stack_size] = read_cpu_timer(); g_prof.stack_size++; } @@ -74,26 +78,55 @@ void prof_stop() g_prof.stack_size--; u64 start = g_prof.timer_stack[g_prof.stack_size]; size_t slot_idx = g_prof.slot_stack[g_prof.stack_size]; + size_t inclusive_duration = g_prof.duration_stack[g_prof.stack_size]; u64 duration = (now - start); - g_prof.slots[slot_idx].duration += duration; - if (g_prof.stack_size > 0) { size_t parent_slot = g_prof.slot_stack[g_prof.stack_size-1]; - g_prof.slots[parent_slot].children_duration += duration; + g_prof.slots[parent_slot].exclusive_duration -= duration; + } + + g_prof.slots[slot_idx].exclusive_duration += duration; + g_prof.slots[slot_idx].inclusive_duration = inclusive_duration + duration; +} + +static int prof_cmp_u32(u32 A, u32 B) +{ + if (A == B) { + return 0; + } else if (A < B) { + return 1; + } else { + return -1; } } -void prof_output() +int prof_cmp_by_calls(const prof_slot **A, const prof_slot **B) +{ + return prof_cmp_u32((*A)->calls, (*B)->calls); +} + +int prof_cmp_by_exclusive_duration(const prof_slot **A, const prof_slot **B) +{ + return prof_cmp_u32((*A)->exclusive_duration, (*B)->exclusive_duration); +} + +int prof_cmp_by_inclusive_duration(const prof_slot **A, const prof_slot **B) +{ + return prof_cmp_u32((*A)->inclusive_duration, (*B)->inclusive_duration); +} + +void prof_output(prof_sort_cmp_cb sort_cb) { assert(g_prof.finished); u64 total_time = g_prof.end_time - g_prof.init_time; u64 cpu_hz = get_cpu_timer_hz(100); - prof_slot *slots[PROFILER_MAX_SLOTS] = { 0 }; + prof_slot *slots[PROFILER_MAX_SLOTS+1] = { 0 }; u32 slot_count = 0; + u64 profiled_duration = 0; u32 label_width = 0; for (int i = 0; i < PROFILER_MAX_SLOTS; i++) { @@ -102,31 +135,54 @@ void prof_output() slots[slot_count] = slot; slot_count++; label_width = MAX(label_width, strlen(slot->label)); + profiled_duration += slot->exclusive_duration; } } + u64 other_duration = total_time - profiled_duration; + prof_slot other_slot = { + .label = "", + .calls = 1, + .inclusive_duration = other_duration, + .exclusive_duration = other_duration + }; + + slots[slot_count++] = &other_slot; + + if (sort_cb) { + qsort(slots, slot_count, sizeof(prof_slot*), (void*)sort_cb); + } + printf("\nTotal time taken: %.3fms (%lu)\n", (float)total_time*1000/cpu_hz, total_time); - char line_format[128]; - snprintf(line_format, ARRAY_LEN(line_format), " %%%ds - %%9lu (%%6.3f%%%%, %%7.3f%%%% w/children) [%%d]\n", label_width); - - u64 profiled_duration = 0; + u32 percent_max_width = 0; + char percent_column[PROFILER_MAX_SLOTS+1][128]; for (int i = 0; i < slot_count; i++) { prof_slot *slot = slots[i]; - f32 percent = (f32)slot->duration*100/total_time; - u64 exclusive_duration = slot->duration - slot->children_duration; - f32 exclusive_percent = (f32)exclusive_duration*100/total_time; - profiled_duration += exclusive_duration; - printf(line_format, slot->label, slot->duration, exclusive_percent, percent, slot->calls); + f32 percent = (f32)slot->inclusive_duration*100/total_time; + f32 exclusive_percent = (f32)slot->exclusive_duration*100/total_time; + u32 length; + if (slot->inclusive_duration == slot->exclusive_duration) { + length = snprintf(percent_column[i], 128, "(%6.3f%%)", exclusive_percent); + } else { + length = snprintf(percent_column[i], 128, "(%6.3f%%, %6.3f%% w/children)", exclusive_percent, percent); + } + percent_max_width = MAX(percent_max_width, length); } - u64 other_duration = total_time - profiled_duration; - f32 other_percent = (f32)other_duration*100/total_time; - printf(line_format, "Other", other_duration, other_percent, other_percent, 1); + char line_format[128]; + snprintf(line_format, ARRAY_LEN(line_format), " %%%ds - %%9lu %%-%ds [%%d]\n", label_width, percent_max_width); + + for (int i = 0; i < slot_count; i++) { + prof_slot *slot = slots[i]; + + printf(line_format, slot->label, slot->inclusive_duration, percent_column[i], slot->calls); + } } #define PROF_START(label) prof_start(__COUNTER__, label) #define PROF_STOP() prof_stop() +static_assert(__COUNTER__ < PROFILER_MAX_SLOTS, "__COUNTER__ reached max profiler slots"); #endif //PROFILER_H_