diff --git a/Makefile b/Makefile index 1109089..e115e23 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ build/gen_data: src/gen_data.c mkdir -p build gcc -o build/gen_data src/gen_data.c $(CFLAGS) -build/main: src/main.c src/json_parser.c src/profiler.c +build/main: src/main.c src/json_parser.c src/rprof.h mkdir -p build gcc -o build/main src/main.c $(CFLAGS) diff --git a/src/json_parser.c b/src/json_parser.c index 2534240..3d7085a 100644 --- a/src/json_parser.c +++ b/src/json_parser.c @@ -8,7 +8,7 @@ #include "harvensine_compute.h" #include "json_parser.h" -#include "profiler.c" +#include "rprof.h" void free_json_object(struct json_object *object) { @@ -76,14 +76,12 @@ static bool is_whitespace(char c) static size_t skip_ws(char **data, size_t *data_size) { - PROF_START("skip_ws()"); size_t ws_count = 0; while (data_size > 0) { if (!is_whitespace(**data)) break; inc_cursor(data, data_size, 1); ws_count++; } - PROF_STOP(); return ws_count; } @@ -151,7 +149,6 @@ static bool expect_number_char(char *data, size_t data_size) int parse_json_value(struct json_value *result, char *data, size_t data_size) { - PROF_START("parse_json_value()"); size_t bytes_parsed = 0; bytes_parsed += skip_ws(&data, &data_size); @@ -177,8 +174,6 @@ int parse_json_value(struct json_value *result, char *data, size_t data_size) assert(false && "todo"); } - PROF_STOP(); - return bytes_parsed; } @@ -274,7 +269,6 @@ int parse_json_array(struct json_array *result, char *data, size_t data_size) int parse_json_number(f64 *result, char *data, size_t data_size) { - PROF_START("parse_json_number()"); int bytes_parsed = 0; for (int i = 0; i < data_size; i++) { if (!is_number_char(data[i])) break; @@ -285,13 +279,11 @@ int parse_json_number(f64 *result, char *data, size_t data_size) // Use alternative parsing, so `data_size` can be enforced *result = strtod(data, NULL); - PROF_STOP(); return bytes_parsed; } int parse_json_string(struct json_string *result, char *data, size_t data_size) { - PROF_START("parse_json_string()"); assert(expect_char(data, data_size, '"')); u32 json_string_size = get_json_string_size(data, data_size); @@ -335,7 +327,6 @@ int parse_json_string(struct json_string *result, char *data, size_t data_size) result->str = realloc(result->str, result->size*sizeof(char)); - PROF_STOP(); return json_string_size; } diff --git a/src/main.c b/src/main.c index d9c6a09..403f75e 100644 --- a/src/main.c +++ b/src/main.c @@ -5,10 +5,10 @@ #include #include +#define RPROF_IMPLEMENTATION #include "harvensine_compute.h" -#include "timer.c" -#include "profiler.c" +#include "rprof.h" #include "json_parser.c" #include "harvensine_formula.c" @@ -91,7 +91,7 @@ static void free_point_pairs(struct point_pairs *pairs) int main(int argc, char **argv) { - prof_init(); + rprof_init(); if (argc <= 1) { print_usage(argv[0]); @@ -106,9 +106,9 @@ int main(int argc, char **argv) return -1; } - PROF_START("Read files"); + RPROF_START("Read files"); size_t json_size = get_file_size(f); - char json_data[json_size]; + char *json_data = malloc(json_size); size_t bytes_read = fread(json_data, 1, json_size, f); if (bytes_read != json_size) { printf("Failed to read all contents of file\n"); @@ -138,18 +138,18 @@ int main(int argc, char **argv) fclose(f); } - PROF_STOP(); + RPROF_STOP(); // Step 1. Read json file - PROF_START("Parse JSON"); + RPROF_START("Parse JSON"); struct json_value *parsed = NULL; { parsed = malloc(sizeof(struct json_value)); int bytes_parsed = parse_json_value(parsed, json_data, json_size); assert(bytes_parsed >= 0); } - PROF_STOP(); + RPROF_STOP(); u32 row_count = parsed->object->values[0]->array->count; if (reference_harvensines_count > 0) { @@ -158,35 +158,35 @@ int main(int argc, char **argv) // Step 2. Convert json object to struct - PROF_START("Convert to struct"); + RPROF_START("Convert to struct"); struct point_pairs *pairs = convert_to_struct(parsed); - PROF_STOP(); + RPROF_STOP(); // Step 3. Calculate harvensine distances - PROF_START("Compute harvensines"); + RPROF_START("Compute harvensines"); f64 *harvensines = malloc(pairs->count*sizeof(f64)); for (int i = 0; i < pairs->count; i++) { struct point_pair *p = &pairs->pairs[i]; harvensines[i] = get_harvensine_distance(p->x0, p->y0, p->x1, p->y1, EARTH_RADIUS); } - PROF_STOP(); + RPROF_STOP(); - PROF_START("Sum harvensines"); + RPROF_START("Sum harvensines"); f64 harvensine_sum = 0; for (int i = 0; i < pairs->count; i++) { harvensine_sum += harvensines[i]; } - PROF_STOP(); + RPROF_STOP(); - PROF_START("Free memory"); + RPROF_START("Free memory"); free(reference_harvensines); free(harvensines); free_json_value(parsed); free_point_pairs(pairs); - PROF_STOP(); + RPROF_STOP(); - prof_end(); + rprof_end(); // Output results @@ -197,7 +197,7 @@ int main(int argc, char **argv) printf("Diff of reference harvensine sum: %.16f\n", harvensine_sum - reference_harvensine_sum); } - prof_output(prof_cmp_by_exclusive_duration); + rprof_output(rprof_cmp_by_exclusive_duration); return 0; } diff --git a/src/profiler.c b/src/profiler.c deleted file mode 100644 index d582016..0000000 --- a/src/profiler.c +++ /dev/null @@ -1,188 +0,0 @@ -#ifndef PROFILER_H_ -#define PROFILER_H_ - -#include -#include -#include -#include -#include - -#include "harvensine_compute.h" -#include "timer.c" - -#define PROFILER_MAX_STACK 128 -#define PROFILER_MAX_SLOTS 32 - -typedef struct { - char *label; - - u32 calls; - u64 inclusive_duration; - u64 exclusive_duration; -} prof_slot; - -typedef struct { - bool started; - bool finished; - - u64 init_time; - u64 end_time; - - size_t slot_stack[PROFILER_MAX_STACK]; - u64 duration_stack[PROFILER_MAX_STACK]; - u64 timer_stack[PROFILER_MAX_STACK]; - u32 stack_size; - - prof_slot slots[PROFILER_MAX_SLOTS]; -} prof; - -typedef int prof_sort_cmp_cb(const prof_slot **A, const prof_slot **B); - -static prof g_prof = { 0 }; - -void prof_init() -{ - assert(!g_prof.started); - g_prof.init_time = read_cpu_timer(); - g_prof.started = true; -} - -void prof_end() -{ - assert(!g_prof.finished); - g_prof.end_time = read_cpu_timer(); - g_prof.finished = true; -} - -void prof_start(size_t slot_idx, char *label) -{ - assert(g_prof.started); - assert(!g_prof.finished); - - assert(slot_idx < PROFILER_MAX_SLOTS); - assert(g_prof.stack_size < PROFILER_MAX_STACK-1); - - prof_slot *slot = &g_prof.slots[slot_idx]; - slot->label = label; - slot->calls++; - - g_prof.duration_stack[g_prof.stack_size] = slot->inclusive_duration; - g_prof.slot_stack[g_prof.stack_size] = slot_idx; - g_prof.timer_stack[g_prof.stack_size] = read_cpu_timer(); - g_prof.stack_size++; -} - -void prof_stop() -{ - u64 now = read_cpu_timer(); - g_prof.stack_size--; - u64 start = g_prof.timer_stack[g_prof.stack_size]; - size_t slot_idx = g_prof.slot_stack[g_prof.stack_size]; - size_t inclusive_duration = g_prof.duration_stack[g_prof.stack_size]; - - u64 duration = (now - start); - - if (g_prof.stack_size > 0) { - size_t parent_slot = g_prof.slot_stack[g_prof.stack_size-1]; - g_prof.slots[parent_slot].exclusive_duration -= duration; - } - - g_prof.slots[slot_idx].exclusive_duration += duration; - g_prof.slots[slot_idx].inclusive_duration = inclusive_duration + duration; -} - -static int prof_cmp_u32(u32 A, u32 B) -{ - if (A == B) { - return 0; - } else if (A < B) { - return 1; - } else { - return -1; - } -} - -int prof_cmp_by_calls(const prof_slot **A, const prof_slot **B) -{ - return prof_cmp_u32((*A)->calls, (*B)->calls); -} - -int prof_cmp_by_exclusive_duration(const prof_slot **A, const prof_slot **B) -{ - return prof_cmp_u32((*A)->exclusive_duration, (*B)->exclusive_duration); -} - -int prof_cmp_by_inclusive_duration(const prof_slot **A, const prof_slot **B) -{ - return prof_cmp_u32((*A)->inclusive_duration, (*B)->inclusive_duration); -} - -void prof_output(prof_sort_cmp_cb sort_cb) -{ - assert(g_prof.finished); - - u64 total_time = g_prof.end_time - g_prof.init_time; - u64 cpu_hz = get_cpu_timer_hz(100); - - prof_slot *slots[PROFILER_MAX_SLOTS+1] = { 0 }; - u32 slot_count = 0; - u64 profiled_duration = 0; - u32 label_width = 0; - - for (int i = 0; i < PROFILER_MAX_SLOTS; i++) { - prof_slot *slot = &g_prof.slots[i]; - if (slot->label) { - slots[slot_count] = slot; - slot_count++; - label_width = MAX(label_width, strlen(slot->label)); - profiled_duration += slot->exclusive_duration; - } - } - - u64 other_duration = total_time - profiled_duration; - prof_slot other_slot = { - .label = "", - .calls = 1, - .inclusive_duration = other_duration, - .exclusive_duration = other_duration - }; - - slots[slot_count++] = &other_slot; - - if (sort_cb) { - qsort(slots, slot_count, sizeof(prof_slot*), (void*)sort_cb); - } - - printf("\nTotal time taken: %.3fms (%lu)\n", (float)total_time*1000/cpu_hz, total_time); - - u32 percent_max_width = 0; - char percent_column[PROFILER_MAX_SLOTS+1][128]; - for (int i = 0; i < slot_count; i++) { - prof_slot *slot = slots[i]; - - f32 percent = (f32)slot->inclusive_duration*100/total_time; - f32 exclusive_percent = (f32)slot->exclusive_duration*100/total_time; - u32 length; - if (slot->inclusive_duration == slot->exclusive_duration) { - length = snprintf(percent_column[i], 128, "(%6.3f%%)", exclusive_percent); - } else { - length = snprintf(percent_column[i], 128, "(%6.3f%%, %6.3f%% w/children)", exclusive_percent, percent); - } - percent_max_width = MAX(percent_max_width, length); - } - - char line_format[128]; - snprintf(line_format, ARRAY_LEN(line_format), " %%%ds - %%9lu %%-%ds [%%d]\n", label_width, percent_max_width); - - for (int i = 0; i < slot_count; i++) { - prof_slot *slot = slots[i]; - - printf(line_format, slot->label, slot->inclusive_duration, percent_column[i], slot->calls); - } -} - -#define PROF_START(label) prof_start(__COUNTER__, label) -#define PROF_STOP() prof_stop() -static_assert(__COUNTER__ < PROFILER_MAX_SLOTS, "__COUNTER__ reached max profiler slots"); - -#endif //PROFILER_H_ diff --git a/src/rprof.h b/src/rprof.h new file mode 100644 index 0000000..c852dd1 --- /dev/null +++ b/src/rprof.h @@ -0,0 +1,312 @@ +#ifndef RPROF_H +#define RPROF_H + +// Available defines for configuration: +// RPROF_IMPLEMENTATION: +// Enable implementation of library +// +// RPROF_MAX_STACK (default 128): +// To record nested blocks rprof uses a stack, this defines the maximum size of that stack +// +// RPROF_MAX_SLOTS (default 32): +// When using `rprof_start()`, you need to specify into which slot the timing will be saved. +// This defines how many slots you have available. +// +// RPROF_ONLY_TOTAL_TIME: +// Don't time block marked between `rprof_start()` and `rprof_end()`. +// Useful for checking the overhead added by the profiler. + +#include +#include +#include +#include +#include +#include +#include + +#ifndef RPROF_MAX_STACK + #define RPROF_MAX_STACK 128 +#endif + +#ifndef RPROF_MAX_SLOTS + #define RPROF_MAX_SLOTS 32 +#endif + +typedef struct { + char *label; + + uint32_t calls; + uint64_t inclusive_duration; + uint64_t exclusive_duration; +} rprof_slot; + +typedef struct { + bool started; + bool finished; + + uint64_t init_time; + uint64_t end_time; + + uint32_t stack_size; + size_t slot_stack[RPROF_MAX_STACK]; + uint64_t duration_stack[RPROF_MAX_STACK]; + uint64_t timer_stack[RPROF_MAX_STACK]; + + rprof_slot slots[RPROF_MAX_SLOTS]; +} rprof; + +typedef int prof_sort_cmp_cb(const rprof_slot **A, const rprof_slot **B); + +static rprof g_rprof = { 0 }; + +void rprof_init(); +void rprof_end(); +void rprof_start(size_t slot_idx, char *label); +void rprof_stop(); + +int rprof_cmp_by_calls(const rprof_slot **A, const rprof_slot **B); +int rprof_cmp_by_exclusive_duration(const rprof_slot **A, const rprof_slot **B); +int rprof_cmp_by_inclusive_duration(const rprof_slot **A, const rprof_slot **B); +void rprof_output(prof_sort_cmp_cb sort_cb); + +#define RPROF_START(label) rprof_start(__COUNTER__, label) +#define RPROF_STOP() rprof_stop() + +#ifdef RPROF_IMPLEMENTATION + +// ------------------------ CPU Timing ------------------------- + +#ifdef WIN32 + #include + #include + + static uint64_t rprof_get_os_timer_hz(void) + { + LARGE_INTEGER Freq; + QueryPerformanceFrequency(&Freq); + return Freq.QuadPart; + } + + static uint64_t rprof_read_os_timer(void) + { + LARGE_INTEGER Value; + QueryPerformanceCounter(&Value); + return Value.QuadPart; + } +#else + #include + #include + + static uint64_t rprof_get_os_timer_hz(void) + { + return 1000000000; + } + + static uint64_t rprof_read_os_timer(void) + { + struct timespec time; + clock_gettime(CLOCK_MONOTONIC_RAW, &time); + return rprof_get_os_timer_hz()*time.tv_sec + time.tv_nsec; + } +#endif // WIN32 + + +static uint64_t rprof_read_cpu_timer(void) +{ + return __rdtsc(); +} + +static uint64_t rprof_get_cpu_timer_hz(uint64_t measure_time_ms) +{ + uint64_t os_freq = rprof_get_os_timer_hz(); + uint64_t os_start = rprof_read_os_timer(); + uint64_t os_elapsed = 0; + + uint64_t cpu_start = rprof_read_cpu_timer(); + + uint64_t wait_duration = os_freq * measure_time_ms / 1000; + while (os_elapsed < wait_duration) { + os_elapsed = rprof_read_os_timer() - os_start; + } + + uint64_t cpu_elapsed = rprof_read_cpu_timer() - cpu_start; + if (os_elapsed) { + return os_freq * cpu_elapsed / os_elapsed; + } else { + return 0; + } +} + +// ------------------------ Profiling ------------------------- + +void rprof_init() +{ + assert(!g_rprof.started); + g_rprof.init_time = rprof_read_cpu_timer(); + g_rprof.started = true; +} + +void rprof_end() +{ + assert(!g_rprof.finished); + g_rprof.end_time = rprof_read_cpu_timer(); + g_rprof.finished = true; +} + +static int rprof_cmp_u32(uint32_t A, uint32_t B) +{ + if (A == B) { + return 0; + } else if (A < B) { + return 1; + } else { + return -1; + } +} + +int rprof_cmp_by_calls(const rprof_slot **A, const rprof_slot **B) +{ + return rprof_cmp_u32((*A)->calls, (*B)->calls); +} + +int rprof_cmp_by_exclusive_duration(const rprof_slot **A, const rprof_slot **B) +{ + return rprof_cmp_u32((*A)->exclusive_duration, (*B)->exclusive_duration); +} + +int rprof_cmp_by_inclusive_duration(const rprof_slot **A, const rprof_slot **B) +{ + return rprof_cmp_u32((*A)->inclusive_duration, (*B)->inclusive_duration); +} + +#ifndef RPROF_ONLY_TOTAL_TIME + + #ifndef ARRAY_LEN + #define ARRAY_LEN(x) (sizeof(x)/sizeof(x[0])) + #endif + + void rprof_start(size_t slot_idx, char *label) + { + assert(g_rprof.started); + assert(!g_rprof.finished); + + assert(slot_idx < RPROF_MAX_SLOTS); + assert(g_rprof.stack_size < RPROF_MAX_STACK-1); + + rprof_slot *slot = &g_rprof.slots[slot_idx]; + slot->label = label; + slot->calls++; + + g_rprof.duration_stack[g_rprof.stack_size] = slot->inclusive_duration; + g_rprof.slot_stack[g_rprof.stack_size] = slot_idx; + g_rprof.timer_stack[g_rprof.stack_size] = rprof_read_cpu_timer(); + g_rprof.stack_size++; + } + + void rprof_stop() + { + uint64_t now = rprof_read_cpu_timer(); + g_rprof.stack_size--; + uint64_t start = g_rprof.timer_stack[g_rprof.stack_size]; + size_t slot_idx = g_rprof.slot_stack[g_rprof.stack_size]; + size_t inclusive_duration = g_rprof.duration_stack[g_rprof.stack_size]; + + uint64_t duration = (now - start); + + if (g_rprof.stack_size > 0) { + size_t parent_slot = g_rprof.slot_stack[g_rprof.stack_size-1]; + g_rprof.slots[parent_slot].exclusive_duration -= duration; + } + + g_rprof.slots[slot_idx].exclusive_duration += duration; + g_rprof.slots[slot_idx].inclusive_duration = inclusive_duration + duration; + } + + void rprof_output(prof_sort_cmp_cb sort_cb) + { + assert(g_rprof.finished); + + uint64_t total_time = g_rprof.end_time - g_rprof.init_time; + uint64_t cpu_hz = rprof_get_cpu_timer_hz(100); + + rprof_slot *slots[RPROF_MAX_SLOTS+1] = { 0 }; + uint32_t slot_count = 0; + uint64_t profiled_duration = 0; + uint32_t label_width = 0; + + for (int i = 0; i < RPROF_MAX_SLOTS; i++) { + rprof_slot *slot = &g_rprof.slots[i]; + if (slot->label) { + slots[slot_count] = slot; + slot_count++; + label_width = MAX(label_width, strlen(slot->label)); + profiled_duration += slot->exclusive_duration; + } + } + + uint64_t other_duration = total_time - profiled_duration; + rprof_slot other_slot = { + .label = "", + .calls = 1, + .inclusive_duration = other_duration, + .exclusive_duration = other_duration + }; + + slots[slot_count++] = &other_slot; + + if (sort_cb) { + qsort(slots, slot_count, sizeof(rprof_slot*), (void*)sort_cb); + } + + printf("\nTotal time taken: %.3fms (%lu)\n", (float)total_time*1000/cpu_hz, total_time); + + uint32_t duration_max_width = 0; + uint32_t percent_max_width = 0; + char percent_column[RPROF_MAX_SLOTS+1][128]; + for (int i = 0; i < slot_count; i++) { + rprof_slot *slot = slots[i]; + + float percent = (float)slot->inclusive_duration*100/total_time; + float exclusive_percent = (float)slot->exclusive_duration*100/total_time; + uint32_t length; + if (slot->inclusive_duration == slot->exclusive_duration) { + length = snprintf(percent_column[i], 128, "(%6.3f%%)", exclusive_percent); + } else { + length = snprintf(percent_column[i], 128, "(%6.3f%%, %6.3f%% w/children)", exclusive_percent, percent); + } + percent_max_width = MAX(percent_max_width, length); + duration_max_width = MAX(duration_max_width, (int)log10(slot->inclusive_duration) + 1); + } + + char line_format[128]; + snprintf(line_format, ARRAY_LEN(line_format), " %%%ds - %%%dlu %%-%ds [%%d]\n", label_width, duration_max_width, percent_max_width); + + for (int i = 0; i < slot_count; i++) { + rprof_slot *slot = slots[i]; + + printf(line_format, slot->label, slot->inclusive_duration, percent_column[i], slot->calls); + } + } + + static_assert(__COUNTER__ < RPROF_MAX_SLOTS, "__COUNTER__ reached max profiler slots"); + +#else + + #define rprof_start(...) + #define rprof_stop(...) + + void rprof_output(prof_sort_cmp_cb sort_cb) + { + assert(g_rprof.finished); + + uint64_t total_time = g_rprof.end_time - g_rprof.init_time; + uint64_t cpu_hz = rprof_get_cpu_timer_hz(100); + + printf("\nTotal time taken: %.3fms (%lu)\n", (float)total_time*1000/cpu_hz, total_time); + } + +#endif // RPROF_ONLY_TOTAL_TIME + +#endif // RPROF_IMPLEMENTATION + +#endif //RPROF_H diff --git a/src/timer.c b/src/timer.c deleted file mode 100644 index 6ebb254..0000000 --- a/src/timer.c +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef TIMER_ -#define TIMER_ - -#include "harvensine_compute.h" - -#ifdef WIN32 - -#include -#include - -static u64 get_os_timer_hz(void) -{ - LARGE_INTEGER Freq; - QueryPerformanceFrequency(&Freq); - return Freq.QuadPart; -} - -static u64 read_os_timer(void) -{ - LARGE_INTEGER Value; - QueryPerformanceCounter(&Value); - return Value.QuadPart; -} - -#else - -#include -#include - -static u64 get_os_timer_hz(void) -{ - return 1000000000; -} - -static u64 read_os_timer(void) -{ - struct timespec time; - clock_gettime(CLOCK_MONOTONIC_RAW, &time); - return get_os_timer_hz()*time.tv_sec + time.tv_nsec; -} - -#endif // WIN32 - - -static u64 read_cpu_timer(void) -{ - return __rdtsc(); -} - -static u64 get_cpu_timer_hz(u64 measure_time_ms) -{ - u64 os_freq = get_os_timer_hz(); - u64 os_start = read_os_timer(); - u64 os_elapsed = 0; - - u64 cpu_start = read_cpu_timer(); - - u64 wait_duration = os_freq * measure_time_ms / 1000; - while (os_elapsed < wait_duration) { - os_elapsed = read_os_timer() - os_start; - } - - u64 cpu_elapsed = read_cpu_timer() - cpu_start; - if (os_elapsed) { - return os_freq * cpu_elapsed / os_elapsed; - } else { - return 0; - } -} - -#endif //TIMER_