commit 4720f092ae56bae84d9d07b0c22972a65f4de096 Author: Rokas Puzonas Date: Thu Feb 15 18:23:18 2024 +0200 initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..378eac2 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +build diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..9a6a9cb --- /dev/null +++ b/Makefile @@ -0,0 +1,14 @@ +CFLAGS=-lm -g -Wall -O0 + +build/main: src/main.c src/repetition_tester.c build/nop_loop.o src/rprof.h + mkdir -p build + gcc -o build/main src/main.c build/nop_loop.o $(CFLAGS) + +build/nop_loop.o: src/nop_loop.asm + nasm -g -f elf64 -o build/nop_loop.o src/nop_loop.asm + +run: ./build/main + ./build/main $(TEST_NAME) + +clean: + rm -r build diff --git a/src/main.c b/src/main.c new file mode 100644 index 0000000..ef35cf9 --- /dev/null +++ b/src/main.c @@ -0,0 +1,35 @@ +#include +#include +#include +#include + +#include "repetition_tester.c" + +#define RPROF_IMPLEMENTATION +#include "rprof.h" + +#include "main_read_file.c" +#include "main_write_bytes.c" + +int main(int argc, char **argv) { + if (argc < 2) { + printf("Usage: %s \n", argv[0]); + return -1; + } + + char *test_name = argv[1]; + + if (!strncmp(test_name, "write_bytes", sizeof("write_bytes"))) { + return main_test_write_bytes(); + } else if (!strncmp(test_name, "read_file", sizeof("read_file"))) { + if (argc < 3) { + printf("Usage: %s read_file \n", argv[0]); + return -1; + } + + return main_test_read_file(argv[2]); + } else { + printf("ERROR: Unknown test case '%s'\n", test_name); + return -1; + } +} diff --git a/src/main_read_file.c b/src/main_read_file.c new file mode 100644 index 0000000..76d46dc --- /dev/null +++ b/src/main_read_file.c @@ -0,0 +1,85 @@ +#include +#include +#include +#include + +#include "repetition_tester.c" + +uint64_t get_file_size(const char *filename) { + struct stat result; + stat(filename, &result); + return result.st_size; +} + +void read_file_with_fread(uint8_t *buffer, uint64_t buffer_size, char *filename) { + FILE *file = fopen(filename, "r"); + + uint64_t read_amount = 0; + while (read_amount < buffer_size) { + int result = fread(&buffer[read_amount], 1, buffer_size - read_amount, file); + + if (result <= 0) { + printf("ERROR: Failed to read from file\n"); + break; + } + + read_amount += result; + } + + fclose(file); +} + +void read_file_with_read(uint8_t *buffer, uint64_t buffer_size, char *filename) { + int file = open(filename, O_RDONLY); + + uint64_t read_amount = 0; + while (read_amount < buffer_size) { + int result = read(file, &buffer[read_amount], buffer_size - read_amount); + + if (result <= 0) { + printf("ERROR: Failed to read from file\n"); + break; + } + + read_amount += result; + } + + close(file); +} + +int main_test_read_file(char *filename) { + typedef void (*read_file_b)(uint8_t *buffer, uint64_t buffer_size, char *filename); + struct testcase { + char *name; + read_file_b cb; + }; + + struct testcase cases[] = { + { .name = "read()", .cb = read_file_with_read }, + { .name = "fread()", .cb = read_file_with_fread }, + }; + + struct repetitor repetitor = {}; + repetitor_init(&repetitor); + printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000)); + + uint64_t file_size = get_file_size(filename); + uint8_t *buffer = malloc(sizeof(uint8_t) * file_size); + + while (repeat_forever(&repetitor)) { + for (int i = 0; i < ARRAY_LEN(cases); i++) { + struct testcase *testcase = &cases[i]; + repetitor_clear(&repetitor); + + while (repetitor_repeat(&repetitor, 1)) { + repetitor_start(&repetitor); + testcase->cb(buffer, file_size, filename); + repetitor_stop(&repetitor, file_size); + } + + repetitor_print_results_label(&repetitor, testcase->name); + } + } + + return 0; +} diff --git a/src/main_write_bytes.c b/src/main_write_bytes.c new file mode 100644 index 0000000..e005f36 --- /dev/null +++ b/src/main_write_bytes.c @@ -0,0 +1,38 @@ +#include "repetition_tester.c" +#include "nop_loop.h" + +int main_test_write_bytes() { + typedef void (*write_bytes_cb)(uint8_t *buffer, uint64_t byte_count); + struct testcase { + char *name; + write_bytes_cb cb; + }; + + struct testcase cases[] = { + { .name = "write bytes 3x1", .cb = writes_bytes_nop_3x1_asm }, + { .name = "write bytes 1x3", .cb = writes_bytes_nop_1x3_asm }, + { .name = "write bytes 1x9", .cb = writes_bytes_nop_1x9_asm }, + }; + + struct repetitor repetitor = {}; + repetitor_init(&repetitor); + printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000)); + + uint64_t byte_count = 4096 * 512; + uint8_t *buffer = malloc(sizeof(uint8_t) * byte_count); + + for (int i = 0; i < ARRAY_LEN(cases); i++) { + struct testcase *testcase = &cases[i]; + repetitor_clear(&repetitor); + + while (repetitor_repeat(&repetitor, 2)) { + repetitor_start(&repetitor); + testcase->cb(buffer, byte_count); + repetitor_stop(&repetitor, byte_count); + } + + repetitor_print_results_label(&repetitor, testcase->name); + } + + return 0; +} diff --git a/src/nop_loop.asm b/src/nop_loop.asm new file mode 100644 index 0000000..597b9de --- /dev/null +++ b/src/nop_loop.asm @@ -0,0 +1,80 @@ +global writes_bytes_mov_asm +global writes_bytes_nop_3x1_asm +global writes_bytes_nop_1x3_asm +global writes_bytes_nop_1x9_asm +global writes_bytes_cmp_asm +global writes_bytes_dec_asm + +section .text + +; rsi - byte_count +; rdi - buffer +writes_bytes_mov_asm: + xor rax, rax +.loop: + mov [rdi+rax], al + inc rax + cmp rax, rsi + jb .loop + ret + +; rsi - byte_count +; rdi - buffer +writes_bytes_nop_3x1_asm: + xor rax, rax +.loop: + db 0x0f, 0x1f, 0x00 + inc rax + cmp rsi,rax + jne .loop + ret + +; rsi - byte_count +; rdi - buffer +writes_bytes_nop_1x3_asm: + xor rax, rax +.loop: + nop + nop + nop + inc rax + cmp rsi,rax + jne .loop + ret + +; rsi - byte_count +; rdi - buffer +writes_bytes_nop_1x9_asm: + xor rax, rax +.loop: + nop + nop + nop + nop + nop + nop + nop + nop + nop + inc rax + cmp rsi,rax + jne .loop + ret + +; rsi - byte_count +; rdi - buffer +writes_bytes_cmp_asm: + xor rax, rax +.loop: + inc rax + cmp rsi,rax + jne .loop + ret + +; rsi - byte_count +; rdi - buffer +writes_bytes_dec_asm: +.loop: + dec rsi + jnz .loop + ret diff --git a/src/nop_loop.h b/src/nop_loop.h new file mode 100644 index 0000000..7d57cbd --- /dev/null +++ b/src/nop_loop.h @@ -0,0 +1,8 @@ +#include + +void writes_bytes_mov_asm(uint8_t *buffer, uint64_t byte_count); +void writes_bytes_nop_3x1_asm(uint8_t *buffer, uint64_t byte_count); +void writes_bytes_nop_1x3_asm(uint8_t *buffer, uint64_t byte_count); +void writes_bytes_nop_1x9_asm(uint8_t *buffer, uint64_t byte_count); +void writes_bytes_cmp_asm(uint8_t *buffer, uint64_t byte_count); +void writes_bytes_dec_asm(uint8_t *buffer, uint64_t byte_count); diff --git a/src/repetition_tester.c b/src/repetition_tester.c new file mode 100644 index 0000000..63520d7 --- /dev/null +++ b/src/repetition_tester.c @@ -0,0 +1,182 @@ +#ifndef REPETITION_TESTER_ +#define REPETITION_TESTER_ + +#include +#include "rprof.h" + +struct repetitor { + uint64_t cpu_freq; + + uint64_t repeat_cycle; + + uint64_t repetition_count; + uint64_t time_taken; + uint64_t page_faults; + + uint64_t total_time_taken; + uint64_t total_page_faults; + uint64_t total_byte_count; + + uint64_t min_time_taken; + uint64_t min_page_faults; + uint64_t min_byte_count; + + uint64_t max_time_taken; + uint64_t max_page_faults; + uint64_t max_byte_count; + + uint64_t last_min_time_taken; + uint64_t last_found_min_time; +}; + +static uint64_t min_uint64(uint64_t a, uint64_t b) { + return a > b ? b : a; +} + +static uint64_t max_uint64(uint64_t a, uint64_t b) { + return a < b ? b : a; +} + +void repetitor_clear(struct repetitor *repetitor) { + repetitor->repetition_count = 0; + + repetitor->time_taken = 0; + repetitor->page_faults = 0; + + repetitor->total_page_faults = 0; + repetitor->total_time_taken = 0; + repetitor->total_byte_count = 0; + + repetitor->min_page_faults = 0; + repetitor->min_time_taken = 0; + repetitor->min_byte_count = 0; + + repetitor->max_page_faults = 0; + repetitor->max_time_taken = 0; + repetitor->max_byte_count = 0; + + repetitor->last_found_min_time = 0; +} + +void repetitor_init(struct repetitor *repetitor) { + repetitor->cpu_freq = rprof_get_cpu_timer_hz(100); + repetitor->repeat_cycle = 0; + repetitor_clear(repetitor); +} + +void repetitor_start(struct repetitor *repetitor) { + repetitor->page_faults -= read_page_faults(); + repetitor->time_taken -= rprof_read_cpu_timer(); +} + +void repetitor_stop(struct repetitor *repetitor, uint64_t byte_count) { + repetitor->time_taken += rprof_read_cpu_timer(); + repetitor->page_faults += read_page_faults(); + + if (repetitor->repetition_count == 0) { + repetitor->max_time_taken = repetitor->time_taken; + repetitor->max_page_faults = repetitor->page_faults; + repetitor->max_byte_count = byte_count; + repetitor->min_time_taken = repetitor->time_taken; + repetitor->min_page_faults = repetitor->page_faults; + repetitor->min_byte_count = byte_count; + } else { + repetitor->max_time_taken = max_uint64(repetitor->max_time_taken, repetitor->time_taken); + repetitor->max_page_faults = max_uint64(repetitor->max_page_faults, repetitor->page_faults); + repetitor->max_byte_count = max_uint64(repetitor->max_byte_count, byte_count); + repetitor->min_time_taken = min_uint64(repetitor->min_time_taken, repetitor->time_taken); + repetitor->min_page_faults = min_uint64(repetitor->min_page_faults, repetitor->page_faults); + repetitor->min_byte_count = min_uint64(repetitor->min_byte_count, byte_count); + } + + repetitor->total_time_taken += repetitor->time_taken; + repetitor->total_page_faults += repetitor->page_faults; + repetitor->total_byte_count += byte_count; + repetitor->repetition_count += 1; + + repetitor->time_taken = 0; + repetitor->page_faults = 0; +} + +bool repetitor_repeat(struct repetitor *repetitor, float timeout_seconds) { + if (repetitor->repetition_count == 0) { + return true; + } + + uint64_t now = rprof_read_cpu_timer(); + + if (repetitor->last_found_min_time != 0) { + uint64_t time_since_last_find = (now - repetitor->last_found_min_time); + uint64_t timeout_cycles = timeout_seconds * repetitor->cpu_freq; + if (time_since_last_find > timeout_cycles) { + return false; + } + } + + if (repetitor->min_time_taken < repetitor->last_min_time_taken || repetitor->last_found_min_time == 0) { + repetitor->last_min_time_taken = repetitor->min_time_taken; + repetitor->last_found_min_time = now; + } + + return true; +} + +bool repeat_forever(struct repetitor *repetitor) { + repetitor->repeat_cycle += 1; + printf("======= Cycle %ld ================================\n", repetitor->repeat_cycle); + return true; +} + +static float cycles_to_s(struct repetitor *repetitor, uint64_t cycles) { + return (float)cycles / repetitor->cpu_freq; +} + +static float cycles_to_ms(struct repetitor *repetitor, uint64_t cycles) { + return cycles_to_s(repetitor, cycles) * 1000; +} + +static float bytes_to_kb(uint64_t bytes) { + return (float)bytes / 1000; +} + +static float bytes_to_gb_s(struct repetitor *repetitor, uint64_t bytes, uint64_t cycles) { + return bytes_to_kb(bytes) / (1000*1000) / cycles_to_s(repetitor, cycles); +} + +static float divide_safe(float a, float b) { + return b != 0 ? a / b : 0; +} + +void repetitor_print_results(struct repetitor *repetitor) { + uint64_t avg_time_taken = repetitor->total_time_taken/repetitor->repetition_count; + uint64_t min_time_taken = repetitor->min_time_taken; + uint64_t max_time_taken = repetitor->max_time_taken; + + uint64_t avg_byte_count = repetitor->total_byte_count/repetitor->repetition_count; + uint64_t min_byte_count = repetitor->min_byte_count; + uint64_t max_byte_count = repetitor->max_byte_count; + + float avg_bandwidth = bytes_to_gb_s(repetitor, avg_byte_count, avg_time_taken); + float min_bandwidth = bytes_to_gb_s(repetitor, min_byte_count, min_time_taken); + float max_bandwidth = bytes_to_gb_s(repetitor, max_byte_count, max_time_taken); + + float avg_page_faults = (float)repetitor->total_page_faults/repetitor->repetition_count; + uint64_t min_page_faults = repetitor->min_page_faults; + uint64_t max_page_faults = repetitor->max_page_faults; + + float avg_kb_per_fault = divide_safe(bytes_to_kb(avg_byte_count), avg_page_faults); + float min_kb_per_fault = divide_safe(bytes_to_kb(repetitor->min_page_faults), min_page_faults); + float max_kb_per_fault = divide_safe(bytes_to_kb(repetitor->max_page_faults), max_page_faults); + + printf(" : %16s : %10s : %12s : %18s : %12s : %18s :\n", "Time (cycles)", "Time (ms)", "Memory (B)", "Bandwidth (GiB/s)", "Page faults", "KiB per page fault"); + printf("Best : %16ld : %10.6f : %12ld : %18.6f : %12ld : %18.6f :\n", min_time_taken, cycles_to_ms(repetitor, min_time_taken), min_byte_count, min_bandwidth, min_page_faults, min_kb_per_fault); + printf("Average : %16ld : %10.6f : %12ld : %18.6f : %12.6f : %18.6f :\n", avg_time_taken, cycles_to_ms(repetitor, avg_time_taken), avg_byte_count, avg_bandwidth, avg_page_faults, avg_kb_per_fault); + printf("Worst : %16ld : %10.6f : %12ld : %18.6f : %12ld : %18.6f :\n", max_time_taken, cycles_to_ms(repetitor, max_time_taken), max_byte_count, max_bandwidth, max_page_faults, max_kb_per_fault); +} + +void repetitor_print_results_label(struct repetitor *repetitor, char *label) { + printf("--------- %s ---------\n", label); + repetitor_print_results(repetitor); +} + +#endif //REPETITION_TESTER_ diff --git a/src/rprof.h b/src/rprof.h new file mode 100644 index 0000000..6128052 --- /dev/null +++ b/src/rprof.h @@ -0,0 +1,344 @@ +#ifndef RPROF_H +#define RPROF_H + +// TODO: Maybe remove `assert()`, to lower overhead? Put them behind a macro? + +// Available defines for configuration: +// RPROF_IMPLEMENTATION: +// Enable implementation of library +// +// RPROF_MAX_STACK (default 128): +// To record nested blocks rprof uses a stack, this defines the maximum size of that stack +// +// RPROF_MAX_SLOTS (default 32): +// When using `rprof_start()`, you need to specify into which slot the timing will be saved. +// This defines how many slots you have available. +// +// RPROF_ONLY_TOTAL_TIME: +// Don't time block marked between `rprof_start()` and `rprof_end()`. +// Useful for checking the overhead added by the profiler. + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef RPROF_MAX_STACK + #define RPROF_MAX_STACK 128 +#endif + +#ifndef RPROF_MAX_SLOTS + #define RPROF_MAX_SLOTS 32 +#endif + +typedef struct { + const char *label; + + uint32_t calls; + uint64_t inclusive_duration; + uint64_t exclusive_duration; + uint64_t bytes_processed; +} rprof_slot; + +typedef struct { + bool started; + bool finished; + + uint64_t init_time; + uint64_t end_time; + + uint32_t stack_size; + size_t slot_stack[RPROF_MAX_STACK]; + uint64_t duration_stack[RPROF_MAX_STACK]; + uint64_t timer_stack[RPROF_MAX_STACK]; + + rprof_slot slots[RPROF_MAX_SLOTS]; +} rprof; + +typedef int prof_sort_cmp_cb(const rprof_slot **A, const rprof_slot **B); + +static rprof g_rprof = { 0 }; + +void rprof_init(); +void rprof_end(); +void rprof_start(size_t slot_idx, char *label, uint64_t bytes_processed); +void rprof_stop(); + +int rprof_cmp_by_calls(const rprof_slot **A, const rprof_slot **B); +int rprof_cmp_by_exclusive_duration(const rprof_slot **A, const rprof_slot **B); +int rprof_cmp_by_inclusive_duration(const rprof_slot **A, const rprof_slot **B); +void rprof_output(prof_sort_cmp_cb sort_cb); + +#define RPROF_START(label) rprof_start(__COUNTER__, label, 0) +#define RPROF_START_BYTES(label, bytes) rprof_start(__COUNTER__, label, bytes) +#define RPROF_STOP() rprof_stop() + +#define RPROF_IMPLEMENTATION +#ifdef RPROF_IMPLEMENTATION + +// ------------------------ CPU Timing ------------------------- + +#ifdef WIN32 + #include + #include + + static uint64_t rprof_get_os_timer_hz(void) + { + LARGE_INTEGER Freq; + QueryPerformanceFrequency(&Freq); + return Freq.QuadPart; + } + + static uint64_t rprof_read_os_timer(void) + { + LARGE_INTEGER Value; + QueryPerformanceCounter(&Value); + return Value.QuadPart; + } +#else + #include + #include + + static uint64_t rprof_get_os_timer_hz(void) + { + return 1000000000; + } + + static uint64_t rprof_read_os_timer(void) + { + struct timespec time; + clock_gettime(CLOCK_MONOTONIC_RAW, &time); + return rprof_get_os_timer_hz()*time.tv_sec + time.tv_nsec; + } +#endif // WIN32 + + +static uint64_t rprof_read_cpu_timer(void) +{ + return __rdtsc(); +} + +static uint64_t rprof_get_cpu_timer_hz(uint64_t measure_time_ms) +{ + uint64_t os_freq = rprof_get_os_timer_hz(); + uint64_t os_start = rprof_read_os_timer(); + uint64_t os_elapsed = 0; + + uint64_t cpu_start = rprof_read_cpu_timer(); + + uint64_t wait_duration = os_freq * measure_time_ms / 1000; + while (os_elapsed < wait_duration) { + os_elapsed = rprof_read_os_timer() - os_start; + } + + uint64_t cpu_elapsed = rprof_read_cpu_timer() - cpu_start; + if (os_elapsed) { + return os_freq * cpu_elapsed / os_elapsed; + } else { + return 0; + } +} + +// ------------------------ Page faults ------------------------- + +#include + +uint64_t read_page_faults() { + // NOTE: ru_minflt the number of page faults serviced without any I/O activity. + // ru_majflt the number of page faults serviced that required I/O activity. + struct rusage usage = {}; + getrusage(RUSAGE_SELF, &usage); + int Result = usage.ru_minflt + usage.ru_majflt; + return Result; +} + +// ------------------------ Profiling ------------------------- + +void rprof_init() +{ + assert(!g_rprof.started); + g_rprof.init_time = rprof_read_cpu_timer(); + g_rprof.started = true; +} + +void rprof_end() +{ + assert(!g_rprof.finished); + g_rprof.end_time = rprof_read_cpu_timer(); + g_rprof.finished = true; +} + +static int rprof_cmp_u32(uint32_t A, uint32_t B) +{ + if (A == B) { + return 0; + } else if (A < B) { + return 1; + } else { + return -1; + } +} + +int rprof_cmp_by_calls(const rprof_slot **A, const rprof_slot **B) +{ + return rprof_cmp_u32((*A)->calls, (*B)->calls); +} + +int rprof_cmp_by_exclusive_duration(const rprof_slot **A, const rprof_slot **B) +{ + return rprof_cmp_u32((*A)->exclusive_duration, (*B)->exclusive_duration); +} + +int rprof_cmp_by_inclusive_duration(const rprof_slot **A, const rprof_slot **B) +{ + return rprof_cmp_u32((*A)->inclusive_duration, (*B)->inclusive_duration); +} + +#ifndef RPROF_ONLY_TOTAL_TIME + + #ifndef ARRAY_LEN + #define ARRAY_LEN(x) (sizeof(x)/sizeof(x[0])) + #endif + + void rprof_start(size_t slot_idx, char *label, uint64_t bytes_processed) + { + assert(g_rprof.started); + assert(!g_rprof.finished); + + assert(slot_idx < RPROF_MAX_SLOTS); + assert(g_rprof.stack_size < RPROF_MAX_STACK-1); + + rprof_slot *slot = &g_rprof.slots[slot_idx]; + slot->label = label; + slot->calls++; + slot->bytes_processed += bytes_processed; + + g_rprof.duration_stack[g_rprof.stack_size] = slot->inclusive_duration; + g_rprof.slot_stack[g_rprof.stack_size] = slot_idx; + g_rprof.timer_stack[g_rprof.stack_size] = rprof_read_cpu_timer(); + g_rprof.stack_size++; + } + + void rprof_stop() + { + uint64_t now = rprof_read_cpu_timer(); + g_rprof.stack_size--; + uint64_t start = g_rprof.timer_stack[g_rprof.stack_size]; + size_t slot_idx = g_rprof.slot_stack[g_rprof.stack_size]; + size_t inclusive_duration = g_rprof.duration_stack[g_rprof.stack_size]; + + uint64_t duration = (now - start); + + if (g_rprof.stack_size > 0) { + size_t parent_slot = g_rprof.slot_stack[g_rprof.stack_size-1]; + g_rprof.slots[parent_slot].exclusive_duration -= duration; + } + + g_rprof.slots[slot_idx].exclusive_duration += duration; + g_rprof.slots[slot_idx].inclusive_duration = inclusive_duration + duration; + } + + void rprof_output(prof_sort_cmp_cb sort_cb) + { + assert(g_rprof.finished); + + uint64_t total_time = g_rprof.end_time - g_rprof.init_time; + uint64_t cpu_hz = rprof_get_cpu_timer_hz(100); + float total_time_secs = (float)total_time / cpu_hz; + + rprof_slot *slots[RPROF_MAX_SLOTS+1] = { 0 }; + uint32_t slot_count = 0; + uint64_t profiled_duration = 0; + uint32_t label_width = 0; + + for (int i = 0; i < RPROF_MAX_SLOTS; i++) { + rprof_slot *slot = &g_rprof.slots[i]; + if (slot->label) { + slots[slot_count] = slot; + slot_count++; + label_width = MAX(label_width, strlen(slot->label)); + profiled_duration += slot->exclusive_duration; + } + } + + uint64_t other_duration = total_time - profiled_duration; + rprof_slot other_slot = { + .label = "", + .calls = 1, + .inclusive_duration = other_duration, + .exclusive_duration = other_duration + }; + + slots[slot_count++] = &other_slot; + + if (sort_cb) { + qsort(slots, slot_count, sizeof(rprof_slot*), (void*)sort_cb); + } + + printf("\nTotal time taken: %.3fms (%lu) (CPU: ~%.3fGHz)\n", total_time_secs*1000, total_time, (float)cpu_hz/1000000000); + + uint32_t duration_max_width = 0; + uint32_t percent_max_width = 0; + char percent_column[RPROF_MAX_SLOTS+1][128]; + for (int i = 0; i < slot_count; i++) { + rprof_slot *slot = slots[i]; + + float percent = (float)slot->inclusive_duration*100/total_time; + float exclusive_percent = (float)slot->exclusive_duration*100/total_time; + uint32_t length; + if (slot->inclusive_duration == slot->exclusive_duration) { + length = snprintf(percent_column[i], 128, "(%6.3f%%)", exclusive_percent); + } else { + length = snprintf(percent_column[i], 128, "(%6.3f%%, %6.3f%% w/children)", exclusive_percent, percent); + } + percent_max_width = MAX(percent_max_width, length); + duration_max_width = MAX(duration_max_width, (int)log10(slot->inclusive_duration) + 1); + } + + char line_format[128]; + snprintf(line_format, ARRAY_LEN(line_format), " %%%ds - %%%dlu %%-%ds [%%d]", label_width, duration_max_width, percent_max_width); + + for (int i = 0; i < slot_count; i++) { + rprof_slot *slot = slots[i]; + + printf(line_format, slot->label, slot->inclusive_duration, percent_column[i], slot->calls); + if (slot->bytes_processed > 0) { + float time_spent = (float)slot->inclusive_duration / cpu_hz; + float megabytes = (float)slot->bytes_processed / (1024 * 1024); + printf(" %.3fmb", megabytes); + if (megabytes > 10) { + printf(" at %.3fgb/s", (megabytes / 1024) / time_spent); + } else { + printf(" at %.3fmb/s", megabytes / time_spent); + } + } + printf("\n"); + } + } + + static_assert(__COUNTER__ < RPROF_MAX_SLOTS, "__COUNTER__ reached max profiler slots"); + +#else + + #define rprof_start(...) + #define rprof_stop(...) + + void rprof_output(prof_sort_cmp_cb sort_cb) + { + assert(g_rprof.finished); + + uint64_t total_time = g_rprof.end_time - g_rprof.init_time; + uint64_t cpu_hz = rprof_get_cpu_timer_hz(100); + + printf("\nTotal time taken: %.3fms (%lu)\n", (float)total_time*1000/cpu_hz, total_time); + } + +#endif // RPROF_ONLY_TOTAL_TIME + +#endif // RPROF_IMPLEMENTATION + +#endif //RPROF_H