1
0

initial commit

This commit is contained in:
Rokas Puzonas 2024-02-15 18:23:18 +02:00
commit 4720f092ae
9 changed files with 787 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
build

14
Makefile Normal file
View File

@ -0,0 +1,14 @@
CFLAGS=-lm -g -Wall -O0
build/main: src/main.c src/repetition_tester.c build/nop_loop.o src/rprof.h
mkdir -p build
gcc -o build/main src/main.c build/nop_loop.o $(CFLAGS)
build/nop_loop.o: src/nop_loop.asm
nasm -g -f elf64 -o build/nop_loop.o src/nop_loop.asm
run: ./build/main
./build/main $(TEST_NAME)
clean:
rm -r build

35
src/main.c Normal file
View File

@ -0,0 +1,35 @@
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <sys/stat.h>
#include "repetition_tester.c"
#define RPROF_IMPLEMENTATION
#include "rprof.h"
#include "main_read_file.c"
#include "main_write_bytes.c"
int main(int argc, char **argv) {
if (argc < 2) {
printf("Usage: %s <test-name>\n", argv[0]);
return -1;
}
char *test_name = argv[1];
if (!strncmp(test_name, "write_bytes", sizeof("write_bytes"))) {
return main_test_write_bytes();
} else if (!strncmp(test_name, "read_file", sizeof("read_file"))) {
if (argc < 3) {
printf("Usage: %s read_file <filename>\n", argv[0]);
return -1;
}
return main_test_read_file(argv[2]);
} else {
printf("ERROR: Unknown test case '%s'\n", test_name);
return -1;
}
}

85
src/main_read_file.c Normal file
View File

@ -0,0 +1,85 @@
#include <sys/stat.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include "repetition_tester.c"
uint64_t get_file_size(const char *filename) {
struct stat result;
stat(filename, &result);
return result.st_size;
}
void read_file_with_fread(uint8_t *buffer, uint64_t buffer_size, char *filename) {
FILE *file = fopen(filename, "r");
uint64_t read_amount = 0;
while (read_amount < buffer_size) {
int result = fread(&buffer[read_amount], 1, buffer_size - read_amount, file);
if (result <= 0) {
printf("ERROR: Failed to read from file\n");
break;
}
read_amount += result;
}
fclose(file);
}
void read_file_with_read(uint8_t *buffer, uint64_t buffer_size, char *filename) {
int file = open(filename, O_RDONLY);
uint64_t read_amount = 0;
while (read_amount < buffer_size) {
int result = read(file, &buffer[read_amount], buffer_size - read_amount);
if (result <= 0) {
printf("ERROR: Failed to read from file\n");
break;
}
read_amount += result;
}
close(file);
}
int main_test_read_file(char *filename) {
typedef void (*read_file_b)(uint8_t *buffer, uint64_t buffer_size, char *filename);
struct testcase {
char *name;
read_file_b cb;
};
struct testcase cases[] = {
{ .name = "read()", .cb = read_file_with_read },
{ .name = "fread()", .cb = read_file_with_fread },
};
struct repetitor repetitor = {};
repetitor_init(&repetitor);
printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000));
uint64_t file_size = get_file_size(filename);
uint8_t *buffer = malloc(sizeof(uint8_t) * file_size);
while (repeat_forever(&repetitor)) {
for (int i = 0; i < ARRAY_LEN(cases); i++) {
struct testcase *testcase = &cases[i];
repetitor_clear(&repetitor);
while (repetitor_repeat(&repetitor, 1)) {
repetitor_start(&repetitor);
testcase->cb(buffer, file_size, filename);
repetitor_stop(&repetitor, file_size);
}
repetitor_print_results_label(&repetitor, testcase->name);
}
}
return 0;
}

38
src/main_write_bytes.c Normal file
View File

@ -0,0 +1,38 @@
#include "repetition_tester.c"
#include "nop_loop.h"
int main_test_write_bytes() {
typedef void (*write_bytes_cb)(uint8_t *buffer, uint64_t byte_count);
struct testcase {
char *name;
write_bytes_cb cb;
};
struct testcase cases[] = {
{ .name = "write bytes 3x1", .cb = writes_bytes_nop_3x1_asm },
{ .name = "write bytes 1x3", .cb = writes_bytes_nop_1x3_asm },
{ .name = "write bytes 1x9", .cb = writes_bytes_nop_1x9_asm },
};
struct repetitor repetitor = {};
repetitor_init(&repetitor);
printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000));
uint64_t byte_count = 4096 * 512;
uint8_t *buffer = malloc(sizeof(uint8_t) * byte_count);
for (int i = 0; i < ARRAY_LEN(cases); i++) {
struct testcase *testcase = &cases[i];
repetitor_clear(&repetitor);
while (repetitor_repeat(&repetitor, 2)) {
repetitor_start(&repetitor);
testcase->cb(buffer, byte_count);
repetitor_stop(&repetitor, byte_count);
}
repetitor_print_results_label(&repetitor, testcase->name);
}
return 0;
}

80
src/nop_loop.asm Normal file
View File

@ -0,0 +1,80 @@
global writes_bytes_mov_asm
global writes_bytes_nop_3x1_asm
global writes_bytes_nop_1x3_asm
global writes_bytes_nop_1x9_asm
global writes_bytes_cmp_asm
global writes_bytes_dec_asm
section .text
; rsi - byte_count
; rdi - buffer
writes_bytes_mov_asm:
xor rax, rax
.loop:
mov [rdi+rax], al
inc rax
cmp rax, rsi
jb .loop
ret
; rsi - byte_count
; rdi - buffer
writes_bytes_nop_3x1_asm:
xor rax, rax
.loop:
db 0x0f, 0x1f, 0x00
inc rax
cmp rsi,rax
jne .loop
ret
; rsi - byte_count
; rdi - buffer
writes_bytes_nop_1x3_asm:
xor rax, rax
.loop:
nop
nop
nop
inc rax
cmp rsi,rax
jne .loop
ret
; rsi - byte_count
; rdi - buffer
writes_bytes_nop_1x9_asm:
xor rax, rax
.loop:
nop
nop
nop
nop
nop
nop
nop
nop
nop
inc rax
cmp rsi,rax
jne .loop
ret
; rsi - byte_count
; rdi - buffer
writes_bytes_cmp_asm:
xor rax, rax
.loop:
inc rax
cmp rsi,rax
jne .loop
ret
; rsi - byte_count
; rdi - buffer
writes_bytes_dec_asm:
.loop:
dec rsi
jnz .loop
ret

8
src/nop_loop.h Normal file
View File

@ -0,0 +1,8 @@
#include <stdint.h>
void writes_bytes_mov_asm(uint8_t *buffer, uint64_t byte_count);
void writes_bytes_nop_3x1_asm(uint8_t *buffer, uint64_t byte_count);
void writes_bytes_nop_1x3_asm(uint8_t *buffer, uint64_t byte_count);
void writes_bytes_nop_1x9_asm(uint8_t *buffer, uint64_t byte_count);
void writes_bytes_cmp_asm(uint8_t *buffer, uint64_t byte_count);
void writes_bytes_dec_asm(uint8_t *buffer, uint64_t byte_count);

182
src/repetition_tester.c Normal file
View File

@ -0,0 +1,182 @@
#ifndef REPETITION_TESTER_
#define REPETITION_TESTER_
#include <stdint.h>
#include "rprof.h"
struct repetitor {
uint64_t cpu_freq;
uint64_t repeat_cycle;
uint64_t repetition_count;
uint64_t time_taken;
uint64_t page_faults;
uint64_t total_time_taken;
uint64_t total_page_faults;
uint64_t total_byte_count;
uint64_t min_time_taken;
uint64_t min_page_faults;
uint64_t min_byte_count;
uint64_t max_time_taken;
uint64_t max_page_faults;
uint64_t max_byte_count;
uint64_t last_min_time_taken;
uint64_t last_found_min_time;
};
static uint64_t min_uint64(uint64_t a, uint64_t b) {
return a > b ? b : a;
}
static uint64_t max_uint64(uint64_t a, uint64_t b) {
return a < b ? b : a;
}
void repetitor_clear(struct repetitor *repetitor) {
repetitor->repetition_count = 0;
repetitor->time_taken = 0;
repetitor->page_faults = 0;
repetitor->total_page_faults = 0;
repetitor->total_time_taken = 0;
repetitor->total_byte_count = 0;
repetitor->min_page_faults = 0;
repetitor->min_time_taken = 0;
repetitor->min_byte_count = 0;
repetitor->max_page_faults = 0;
repetitor->max_time_taken = 0;
repetitor->max_byte_count = 0;
repetitor->last_found_min_time = 0;
}
void repetitor_init(struct repetitor *repetitor) {
repetitor->cpu_freq = rprof_get_cpu_timer_hz(100);
repetitor->repeat_cycle = 0;
repetitor_clear(repetitor);
}
void repetitor_start(struct repetitor *repetitor) {
repetitor->page_faults -= read_page_faults();
repetitor->time_taken -= rprof_read_cpu_timer();
}
void repetitor_stop(struct repetitor *repetitor, uint64_t byte_count) {
repetitor->time_taken += rprof_read_cpu_timer();
repetitor->page_faults += read_page_faults();
if (repetitor->repetition_count == 0) {
repetitor->max_time_taken = repetitor->time_taken;
repetitor->max_page_faults = repetitor->page_faults;
repetitor->max_byte_count = byte_count;
repetitor->min_time_taken = repetitor->time_taken;
repetitor->min_page_faults = repetitor->page_faults;
repetitor->min_byte_count = byte_count;
} else {
repetitor->max_time_taken = max_uint64(repetitor->max_time_taken, repetitor->time_taken);
repetitor->max_page_faults = max_uint64(repetitor->max_page_faults, repetitor->page_faults);
repetitor->max_byte_count = max_uint64(repetitor->max_byte_count, byte_count);
repetitor->min_time_taken = min_uint64(repetitor->min_time_taken, repetitor->time_taken);
repetitor->min_page_faults = min_uint64(repetitor->min_page_faults, repetitor->page_faults);
repetitor->min_byte_count = min_uint64(repetitor->min_byte_count, byte_count);
}
repetitor->total_time_taken += repetitor->time_taken;
repetitor->total_page_faults += repetitor->page_faults;
repetitor->total_byte_count += byte_count;
repetitor->repetition_count += 1;
repetitor->time_taken = 0;
repetitor->page_faults = 0;
}
bool repetitor_repeat(struct repetitor *repetitor, float timeout_seconds) {
if (repetitor->repetition_count == 0) {
return true;
}
uint64_t now = rprof_read_cpu_timer();
if (repetitor->last_found_min_time != 0) {
uint64_t time_since_last_find = (now - repetitor->last_found_min_time);
uint64_t timeout_cycles = timeout_seconds * repetitor->cpu_freq;
if (time_since_last_find > timeout_cycles) {
return false;
}
}
if (repetitor->min_time_taken < repetitor->last_min_time_taken || repetitor->last_found_min_time == 0) {
repetitor->last_min_time_taken = repetitor->min_time_taken;
repetitor->last_found_min_time = now;
}
return true;
}
bool repeat_forever(struct repetitor *repetitor) {
repetitor->repeat_cycle += 1;
printf("======= Cycle %ld ================================\n", repetitor->repeat_cycle);
return true;
}
static float cycles_to_s(struct repetitor *repetitor, uint64_t cycles) {
return (float)cycles / repetitor->cpu_freq;
}
static float cycles_to_ms(struct repetitor *repetitor, uint64_t cycles) {
return cycles_to_s(repetitor, cycles) * 1000;
}
static float bytes_to_kb(uint64_t bytes) {
return (float)bytes / 1000;
}
static float bytes_to_gb_s(struct repetitor *repetitor, uint64_t bytes, uint64_t cycles) {
return bytes_to_kb(bytes) / (1000*1000) / cycles_to_s(repetitor, cycles);
}
static float divide_safe(float a, float b) {
return b != 0 ? a / b : 0;
}
void repetitor_print_results(struct repetitor *repetitor) {
uint64_t avg_time_taken = repetitor->total_time_taken/repetitor->repetition_count;
uint64_t min_time_taken = repetitor->min_time_taken;
uint64_t max_time_taken = repetitor->max_time_taken;
uint64_t avg_byte_count = repetitor->total_byte_count/repetitor->repetition_count;
uint64_t min_byte_count = repetitor->min_byte_count;
uint64_t max_byte_count = repetitor->max_byte_count;
float avg_bandwidth = bytes_to_gb_s(repetitor, avg_byte_count, avg_time_taken);
float min_bandwidth = bytes_to_gb_s(repetitor, min_byte_count, min_time_taken);
float max_bandwidth = bytes_to_gb_s(repetitor, max_byte_count, max_time_taken);
float avg_page_faults = (float)repetitor->total_page_faults/repetitor->repetition_count;
uint64_t min_page_faults = repetitor->min_page_faults;
uint64_t max_page_faults = repetitor->max_page_faults;
float avg_kb_per_fault = divide_safe(bytes_to_kb(avg_byte_count), avg_page_faults);
float min_kb_per_fault = divide_safe(bytes_to_kb(repetitor->min_page_faults), min_page_faults);
float max_kb_per_fault = divide_safe(bytes_to_kb(repetitor->max_page_faults), max_page_faults);
printf(" : %16s : %10s : %12s : %18s : %12s : %18s :\n", "Time (cycles)", "Time (ms)", "Memory (B)", "Bandwidth (GiB/s)", "Page faults", "KiB per page fault");
printf("Best : %16ld : %10.6f : %12ld : %18.6f : %12ld : %18.6f :\n", min_time_taken, cycles_to_ms(repetitor, min_time_taken), min_byte_count, min_bandwidth, min_page_faults, min_kb_per_fault);
printf("Average : %16ld : %10.6f : %12ld : %18.6f : %12.6f : %18.6f :\n", avg_time_taken, cycles_to_ms(repetitor, avg_time_taken), avg_byte_count, avg_bandwidth, avg_page_faults, avg_kb_per_fault);
printf("Worst : %16ld : %10.6f : %12ld : %18.6f : %12ld : %18.6f :\n", max_time_taken, cycles_to_ms(repetitor, max_time_taken), max_byte_count, max_bandwidth, max_page_faults, max_kb_per_fault);
}
void repetitor_print_results_label(struct repetitor *repetitor, char *label) {
printf("--------- %s ---------\n", label);
repetitor_print_results(repetitor);
}
#endif //REPETITION_TESTER_

344
src/rprof.h Normal file
View File

@ -0,0 +1,344 @@
#ifndef RPROF_H
#define RPROF_H
// TODO: Maybe remove `assert()`, to lower overhead? Put them behind a macro?
// Available defines for configuration:
// RPROF_IMPLEMENTATION:
// Enable implementation of library
//
// RPROF_MAX_STACK (default 128):
// To record nested blocks rprof uses a stack, this defines the maximum size of that stack
//
// RPROF_MAX_SLOTS (default 32):
// When using `rprof_start()`, you need to specify into which slot the timing will be saved.
// This defines how many slots you have available.
//
// RPROF_ONLY_TOTAL_TIME:
// Don't time block marked between `rprof_start()` and `rprof_end()`.
// Useful for checking the overhead added by the profiler.
#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdbool.h>
#include <assert.h>
#include <string.h>
#include <sys/param.h>
#include <stdlib.h>
#ifndef RPROF_MAX_STACK
#define RPROF_MAX_STACK 128
#endif
#ifndef RPROF_MAX_SLOTS
#define RPROF_MAX_SLOTS 32
#endif
typedef struct {
const char *label;
uint32_t calls;
uint64_t inclusive_duration;
uint64_t exclusive_duration;
uint64_t bytes_processed;
} rprof_slot;
typedef struct {
bool started;
bool finished;
uint64_t init_time;
uint64_t end_time;
uint32_t stack_size;
size_t slot_stack[RPROF_MAX_STACK];
uint64_t duration_stack[RPROF_MAX_STACK];
uint64_t timer_stack[RPROF_MAX_STACK];
rprof_slot slots[RPROF_MAX_SLOTS];
} rprof;
typedef int prof_sort_cmp_cb(const rprof_slot **A, const rprof_slot **B);
static rprof g_rprof = { 0 };
void rprof_init();
void rprof_end();
void rprof_start(size_t slot_idx, char *label, uint64_t bytes_processed);
void rprof_stop();
int rprof_cmp_by_calls(const rprof_slot **A, const rprof_slot **B);
int rprof_cmp_by_exclusive_duration(const rprof_slot **A, const rprof_slot **B);
int rprof_cmp_by_inclusive_duration(const rprof_slot **A, const rprof_slot **B);
void rprof_output(prof_sort_cmp_cb sort_cb);
#define RPROF_START(label) rprof_start(__COUNTER__, label, 0)
#define RPROF_START_BYTES(label, bytes) rprof_start(__COUNTER__, label, bytes)
#define RPROF_STOP() rprof_stop()
#define RPROF_IMPLEMENTATION
#ifdef RPROF_IMPLEMENTATION
// ------------------------ CPU Timing -------------------------
#ifdef WIN32
#include <intrin.h>
#include <windows.h>
static uint64_t rprof_get_os_timer_hz(void)
{
LARGE_INTEGER Freq;
QueryPerformanceFrequency(&Freq);
return Freq.QuadPart;
}
static uint64_t rprof_read_os_timer(void)
{
LARGE_INTEGER Value;
QueryPerformanceCounter(&Value);
return Value.QuadPart;
}
#else
#include <x86intrin.h>
#include <time.h>
static uint64_t rprof_get_os_timer_hz(void)
{
return 1000000000;
}
static uint64_t rprof_read_os_timer(void)
{
struct timespec time;
clock_gettime(CLOCK_MONOTONIC_RAW, &time);
return rprof_get_os_timer_hz()*time.tv_sec + time.tv_nsec;
}
#endif // WIN32
static uint64_t rprof_read_cpu_timer(void)
{
return __rdtsc();
}
static uint64_t rprof_get_cpu_timer_hz(uint64_t measure_time_ms)
{
uint64_t os_freq = rprof_get_os_timer_hz();
uint64_t os_start = rprof_read_os_timer();
uint64_t os_elapsed = 0;
uint64_t cpu_start = rprof_read_cpu_timer();
uint64_t wait_duration = os_freq * measure_time_ms / 1000;
while (os_elapsed < wait_duration) {
os_elapsed = rprof_read_os_timer() - os_start;
}
uint64_t cpu_elapsed = rprof_read_cpu_timer() - cpu_start;
if (os_elapsed) {
return os_freq * cpu_elapsed / os_elapsed;
} else {
return 0;
}
}
// ------------------------ Page faults -------------------------
#include <sys/resource.h>
uint64_t read_page_faults() {
// NOTE: ru_minflt the number of page faults serviced without any I/O activity.
// ru_majflt the number of page faults serviced that required I/O activity.
struct rusage usage = {};
getrusage(RUSAGE_SELF, &usage);
int Result = usage.ru_minflt + usage.ru_majflt;
return Result;
}
// ------------------------ Profiling -------------------------
void rprof_init()
{
assert(!g_rprof.started);
g_rprof.init_time = rprof_read_cpu_timer();
g_rprof.started = true;
}
void rprof_end()
{
assert(!g_rprof.finished);
g_rprof.end_time = rprof_read_cpu_timer();
g_rprof.finished = true;
}
static int rprof_cmp_u32(uint32_t A, uint32_t B)
{
if (A == B) {
return 0;
} else if (A < B) {
return 1;
} else {
return -1;
}
}
int rprof_cmp_by_calls(const rprof_slot **A, const rprof_slot **B)
{
return rprof_cmp_u32((*A)->calls, (*B)->calls);
}
int rprof_cmp_by_exclusive_duration(const rprof_slot **A, const rprof_slot **B)
{
return rprof_cmp_u32((*A)->exclusive_duration, (*B)->exclusive_duration);
}
int rprof_cmp_by_inclusive_duration(const rprof_slot **A, const rprof_slot **B)
{
return rprof_cmp_u32((*A)->inclusive_duration, (*B)->inclusive_duration);
}
#ifndef RPROF_ONLY_TOTAL_TIME
#ifndef ARRAY_LEN
#define ARRAY_LEN(x) (sizeof(x)/sizeof(x[0]))
#endif
void rprof_start(size_t slot_idx, char *label, uint64_t bytes_processed)
{
assert(g_rprof.started);
assert(!g_rprof.finished);
assert(slot_idx < RPROF_MAX_SLOTS);
assert(g_rprof.stack_size < RPROF_MAX_STACK-1);
rprof_slot *slot = &g_rprof.slots[slot_idx];
slot->label = label;
slot->calls++;
slot->bytes_processed += bytes_processed;
g_rprof.duration_stack[g_rprof.stack_size] = slot->inclusive_duration;
g_rprof.slot_stack[g_rprof.stack_size] = slot_idx;
g_rprof.timer_stack[g_rprof.stack_size] = rprof_read_cpu_timer();
g_rprof.stack_size++;
}
void rprof_stop()
{
uint64_t now = rprof_read_cpu_timer();
g_rprof.stack_size--;
uint64_t start = g_rprof.timer_stack[g_rprof.stack_size];
size_t slot_idx = g_rprof.slot_stack[g_rprof.stack_size];
size_t inclusive_duration = g_rprof.duration_stack[g_rprof.stack_size];
uint64_t duration = (now - start);
if (g_rprof.stack_size > 0) {
size_t parent_slot = g_rprof.slot_stack[g_rprof.stack_size-1];
g_rprof.slots[parent_slot].exclusive_duration -= duration;
}
g_rprof.slots[slot_idx].exclusive_duration += duration;
g_rprof.slots[slot_idx].inclusive_duration = inclusive_duration + duration;
}
void rprof_output(prof_sort_cmp_cb sort_cb)
{
assert(g_rprof.finished);
uint64_t total_time = g_rprof.end_time - g_rprof.init_time;
uint64_t cpu_hz = rprof_get_cpu_timer_hz(100);
float total_time_secs = (float)total_time / cpu_hz;
rprof_slot *slots[RPROF_MAX_SLOTS+1] = { 0 };
uint32_t slot_count = 0;
uint64_t profiled_duration = 0;
uint32_t label_width = 0;
for (int i = 0; i < RPROF_MAX_SLOTS; i++) {
rprof_slot *slot = &g_rprof.slots[i];
if (slot->label) {
slots[slot_count] = slot;
slot_count++;
label_width = MAX(label_width, strlen(slot->label));
profiled_duration += slot->exclusive_duration;
}
}
uint64_t other_duration = total_time - profiled_duration;
rprof_slot other_slot = {
.label = "<other>",
.calls = 1,
.inclusive_duration = other_duration,
.exclusive_duration = other_duration
};
slots[slot_count++] = &other_slot;
if (sort_cb) {
qsort(slots, slot_count, sizeof(rprof_slot*), (void*)sort_cb);
}
printf("\nTotal time taken: %.3fms (%lu) (CPU: ~%.3fGHz)\n", total_time_secs*1000, total_time, (float)cpu_hz/1000000000);
uint32_t duration_max_width = 0;
uint32_t percent_max_width = 0;
char percent_column[RPROF_MAX_SLOTS+1][128];
for (int i = 0; i < slot_count; i++) {
rprof_slot *slot = slots[i];
float percent = (float)slot->inclusive_duration*100/total_time;
float exclusive_percent = (float)slot->exclusive_duration*100/total_time;
uint32_t length;
if (slot->inclusive_duration == slot->exclusive_duration) {
length = snprintf(percent_column[i], 128, "(%6.3f%%)", exclusive_percent);
} else {
length = snprintf(percent_column[i], 128, "(%6.3f%%, %6.3f%% w/children)", exclusive_percent, percent);
}
percent_max_width = MAX(percent_max_width, length);
duration_max_width = MAX(duration_max_width, (int)log10(slot->inclusive_duration) + 1);
}
char line_format[128];
snprintf(line_format, ARRAY_LEN(line_format), " %%%ds - %%%dlu %%-%ds [%%d]", label_width, duration_max_width, percent_max_width);
for (int i = 0; i < slot_count; i++) {
rprof_slot *slot = slots[i];
printf(line_format, slot->label, slot->inclusive_duration, percent_column[i], slot->calls);
if (slot->bytes_processed > 0) {
float time_spent = (float)slot->inclusive_duration / cpu_hz;
float megabytes = (float)slot->bytes_processed / (1024 * 1024);
printf(" %.3fmb", megabytes);
if (megabytes > 10) {
printf(" at %.3fgb/s", (megabytes / 1024) / time_spent);
} else {
printf(" at %.3fmb/s", megabytes / time_spent);
}
}
printf("\n");
}
}
static_assert(__COUNTER__ < RPROF_MAX_SLOTS, "__COUNTER__ reached max profiler slots");
#else
#define rprof_start(...)
#define rprof_stop(...)
void rprof_output(prof_sort_cmp_cb sort_cb)
{
assert(g_rprof.finished);
uint64_t total_time = g_rprof.end_time - g_rprof.init_time;
uint64_t cpu_hz = rprof_get_cpu_timer_hz(100);
printf("\nTotal time taken: %.3fms (%lu)\n", (float)total_time*1000/cpu_hz, total_time);
}
#endif // RPROF_ONLY_TOTAL_TIME
#endif // RPROF_IMPLEMENTATION
#endif //RPROF_H