diff --git a/src/main.c b/src/main.c index 403f75e..05c817c 100644 --- a/src/main.c +++ b/src/main.c @@ -106,8 +106,8 @@ int main(int argc, char **argv) return -1; } - RPROF_START("Read files"); size_t json_size = get_file_size(f); + RPROF_START_BYTES("Read JSON file", json_size); char *json_data = malloc(json_size); size_t bytes_read = fread(json_data, 1, json_size, f); if (bytes_read != json_size) { @@ -115,12 +115,14 @@ int main(int argc, char **argv) return -1; } fclose(f); + RPROF_STOP(); f64 *reference_harvensines = NULL; size_t reference_harvensines_count = 0; f64 reference_harvensine_sum = 0; if (argc >= 3) { + RPROF_START("Read answer file"); char *answers_filename = argv[2]; FILE *f = fopen(answers_filename, "r"); if (f == NULL) { @@ -137,12 +139,12 @@ int main(int argc, char **argv) fread(&reference_harvensine_sum, sizeof(f64), 1, f); fclose(f); + RPROF_STOP(); } - RPROF_STOP(); // Step 1. Read json file - RPROF_START("Parse JSON"); + RPROF_START_BYTES("Parse JSON", json_size); struct json_value *parsed = NULL; { parsed = malloc(sizeof(struct json_value)); @@ -164,7 +166,7 @@ int main(int argc, char **argv) // Step 3. Calculate harvensine distances - RPROF_START("Compute harvensines"); + RPROF_START_BYTES("Compute harvensines", sizeof(struct point_pair)*pairs->count); f64 *harvensines = malloc(pairs->count*sizeof(f64)); for (int i = 0; i < pairs->count; i++) { struct point_pair *p = &pairs->pairs[i]; @@ -172,19 +174,24 @@ int main(int argc, char **argv) } RPROF_STOP(); - RPROF_START("Sum harvensines"); + RPROF_START_BYTES("Sum harvensines", sizeof(f64)*pairs->count); f64 harvensine_sum = 0; for (int i = 0; i < pairs->count; i++) { harvensine_sum += harvensines[i]; } RPROF_STOP(); - RPROF_START("Free memory"); + RPROF_START_BYTES("Free struct memory", sizeof(f64)*pairs->count + sizeof(struct point_pair)*pairs->count); free(reference_harvensines); free(harvensines); + RPROF_STOP(); + RPROF_START("Free json memory"); free_json_value(parsed); free_point_pairs(pairs); RPROF_STOP(); + RPROF_START_BYTES("Free json file", json_size); + free(json_data); + RPROF_STOP(); rprof_end(); diff --git a/src/rprof.h b/src/rprof.h index 5234621..8bb5e02 100644 --- a/src/rprof.h +++ b/src/rprof.h @@ -40,6 +40,7 @@ typedef struct { uint32_t calls; uint64_t inclusive_duration; uint64_t exclusive_duration; + uint64_t bytes_processed; } rprof_slot; typedef struct { @@ -63,7 +64,7 @@ static rprof g_rprof = { 0 }; void rprof_init(); void rprof_end(); -void rprof_start(size_t slot_idx, char *label); +void rprof_start(size_t slot_idx, char *label, uint64_t bytes_processed); void rprof_stop(); int rprof_cmp_by_calls(const rprof_slot **A, const rprof_slot **B); @@ -71,7 +72,8 @@ int rprof_cmp_by_exclusive_duration(const rprof_slot **A, const rprof_slot **B); int rprof_cmp_by_inclusive_duration(const rprof_slot **A, const rprof_slot **B); void rprof_output(prof_sort_cmp_cb sort_cb); -#define RPROF_START(label) rprof_start(__COUNTER__, label) +#define RPROF_START(label) rprof_start(__COUNTER__, label, 0) +#define RPROF_START_BYTES(label, bytes) rprof_start(__COUNTER__, label, bytes) #define RPROF_STOP() rprof_stop() #ifdef RPROF_IMPLEMENTATION @@ -187,7 +189,7 @@ int rprof_cmp_by_inclusive_duration(const rprof_slot **A, const rprof_slot **B) #define ARRAY_LEN(x) (sizeof(x)/sizeof(x[0])) #endif - void rprof_start(size_t slot_idx, char *label) + void rprof_start(size_t slot_idx, char *label, uint64_t bytes_processed) { assert(g_rprof.started); assert(!g_rprof.finished); @@ -198,6 +200,7 @@ int rprof_cmp_by_inclusive_duration(const rprof_slot **A, const rprof_slot **B) rprof_slot *slot = &g_rprof.slots[slot_idx]; slot->label = label; slot->calls++; + slot->bytes_processed += bytes_processed; g_rprof.duration_stack[g_rprof.stack_size] = slot->inclusive_duration; g_rprof.slot_stack[g_rprof.stack_size] = slot_idx; @@ -230,6 +233,7 @@ int rprof_cmp_by_inclusive_duration(const rprof_slot **A, const rprof_slot **B) uint64_t total_time = g_rprof.end_time - g_rprof.init_time; uint64_t cpu_hz = rprof_get_cpu_timer_hz(100); + float total_time_secs = (float)total_time / cpu_hz; rprof_slot *slots[RPROF_MAX_SLOTS+1] = { 0 }; uint32_t slot_count = 0; @@ -260,7 +264,7 @@ int rprof_cmp_by_inclusive_duration(const rprof_slot **A, const rprof_slot **B) qsort(slots, slot_count, sizeof(rprof_slot*), (void*)sort_cb); } - printf("\nTotal time taken: %.3fms (%lu)\n", (float)total_time*1000/cpu_hz, total_time); + printf("\nTotal time taken: %.3fms (%lu) (CPU: ~%.3fGHz)\n", total_time_secs*1000, total_time, (float)cpu_hz/1000000000); uint32_t duration_max_width = 0; uint32_t percent_max_width = 0; @@ -281,12 +285,22 @@ int rprof_cmp_by_inclusive_duration(const rprof_slot **A, const rprof_slot **B) } char line_format[128]; - snprintf(line_format, ARRAY_LEN(line_format), " %%%ds - %%%dlu %%-%ds [%%d]\n", label_width, duration_max_width, percent_max_width); + snprintf(line_format, ARRAY_LEN(line_format), " %%%ds - %%%dlu %%-%ds [%%d]", label_width, duration_max_width, percent_max_width); for (int i = 0; i < slot_count; i++) { rprof_slot *slot = slots[i]; printf(line_format, slot->label, slot->inclusive_duration, percent_column[i], slot->calls); + if (slot->bytes_processed > 0) { + float time_spent = (float)slot->inclusive_duration / cpu_hz; + float megabytes = (float)slot->bytes_processed / (1024 * 1024); + if (megabytes > 10) { + printf(" at %.3fgb/s", (megabytes / 1024) / time_spent); + } else { + printf(" at %.3fmb/s", megabytes / time_spent); + } + } + printf("\n"); } }