initial commit
This commit is contained in:
commit
4720f092ae
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
build
|
14
Makefile
Normal file
14
Makefile
Normal file
@ -0,0 +1,14 @@
|
||||
CFLAGS=-lm -g -Wall -O0
|
||||
|
||||
build/main: src/main.c src/repetition_tester.c build/nop_loop.o src/rprof.h
|
||||
mkdir -p build
|
||||
gcc -o build/main src/main.c build/nop_loop.o $(CFLAGS)
|
||||
|
||||
build/nop_loop.o: src/nop_loop.asm
|
||||
nasm -g -f elf64 -o build/nop_loop.o src/nop_loop.asm
|
||||
|
||||
run: ./build/main
|
||||
./build/main $(TEST_NAME)
|
||||
|
||||
clean:
|
||||
rm -r build
|
35
src/main.c
Normal file
35
src/main.c
Normal file
@ -0,0 +1,35 @@
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include "repetition_tester.c"
|
||||
|
||||
#define RPROF_IMPLEMENTATION
|
||||
#include "rprof.h"
|
||||
|
||||
#include "main_read_file.c"
|
||||
#include "main_write_bytes.c"
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 2) {
|
||||
printf("Usage: %s <test-name>\n", argv[0]);
|
||||
return -1;
|
||||
}
|
||||
|
||||
char *test_name = argv[1];
|
||||
|
||||
if (!strncmp(test_name, "write_bytes", sizeof("write_bytes"))) {
|
||||
return main_test_write_bytes();
|
||||
} else if (!strncmp(test_name, "read_file", sizeof("read_file"))) {
|
||||
if (argc < 3) {
|
||||
printf("Usage: %s read_file <filename>\n", argv[0]);
|
||||
return -1;
|
||||
}
|
||||
|
||||
return main_test_read_file(argv[2]);
|
||||
} else {
|
||||
printf("ERROR: Unknown test case '%s'\n", test_name);
|
||||
return -1;
|
||||
}
|
||||
}
|
85
src/main_read_file.c
Normal file
85
src/main_read_file.c
Normal file
@ -0,0 +1,85 @@
|
||||
#include <sys/stat.h>
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#include "repetition_tester.c"
|
||||
|
||||
uint64_t get_file_size(const char *filename) {
|
||||
struct stat result;
|
||||
stat(filename, &result);
|
||||
return result.st_size;
|
||||
}
|
||||
|
||||
void read_file_with_fread(uint8_t *buffer, uint64_t buffer_size, char *filename) {
|
||||
FILE *file = fopen(filename, "r");
|
||||
|
||||
uint64_t read_amount = 0;
|
||||
while (read_amount < buffer_size) {
|
||||
int result = fread(&buffer[read_amount], 1, buffer_size - read_amount, file);
|
||||
|
||||
if (result <= 0) {
|
||||
printf("ERROR: Failed to read from file\n");
|
||||
break;
|
||||
}
|
||||
|
||||
read_amount += result;
|
||||
}
|
||||
|
||||
fclose(file);
|
||||
}
|
||||
|
||||
void read_file_with_read(uint8_t *buffer, uint64_t buffer_size, char *filename) {
|
||||
int file = open(filename, O_RDONLY);
|
||||
|
||||
uint64_t read_amount = 0;
|
||||
while (read_amount < buffer_size) {
|
||||
int result = read(file, &buffer[read_amount], buffer_size - read_amount);
|
||||
|
||||
if (result <= 0) {
|
||||
printf("ERROR: Failed to read from file\n");
|
||||
break;
|
||||
}
|
||||
|
||||
read_amount += result;
|
||||
}
|
||||
|
||||
close(file);
|
||||
}
|
||||
|
||||
int main_test_read_file(char *filename) {
|
||||
typedef void (*read_file_b)(uint8_t *buffer, uint64_t buffer_size, char *filename);
|
||||
struct testcase {
|
||||
char *name;
|
||||
read_file_b cb;
|
||||
};
|
||||
|
||||
struct testcase cases[] = {
|
||||
{ .name = "read()", .cb = read_file_with_read },
|
||||
{ .name = "fread()", .cb = read_file_with_fread },
|
||||
};
|
||||
|
||||
struct repetitor repetitor = {};
|
||||
repetitor_init(&repetitor);
|
||||
printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000));
|
||||
|
||||
uint64_t file_size = get_file_size(filename);
|
||||
uint8_t *buffer = malloc(sizeof(uint8_t) * file_size);
|
||||
|
||||
while (repeat_forever(&repetitor)) {
|
||||
for (int i = 0; i < ARRAY_LEN(cases); i++) {
|
||||
struct testcase *testcase = &cases[i];
|
||||
repetitor_clear(&repetitor);
|
||||
|
||||
while (repetitor_repeat(&repetitor, 1)) {
|
||||
repetitor_start(&repetitor);
|
||||
testcase->cb(buffer, file_size, filename);
|
||||
repetitor_stop(&repetitor, file_size);
|
||||
}
|
||||
|
||||
repetitor_print_results_label(&repetitor, testcase->name);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
38
src/main_write_bytes.c
Normal file
38
src/main_write_bytes.c
Normal file
@ -0,0 +1,38 @@
|
||||
#include "repetition_tester.c"
|
||||
#include "nop_loop.h"
|
||||
|
||||
int main_test_write_bytes() {
|
||||
typedef void (*write_bytes_cb)(uint8_t *buffer, uint64_t byte_count);
|
||||
struct testcase {
|
||||
char *name;
|
||||
write_bytes_cb cb;
|
||||
};
|
||||
|
||||
struct testcase cases[] = {
|
||||
{ .name = "write bytes 3x1", .cb = writes_bytes_nop_3x1_asm },
|
||||
{ .name = "write bytes 1x3", .cb = writes_bytes_nop_1x3_asm },
|
||||
{ .name = "write bytes 1x9", .cb = writes_bytes_nop_1x9_asm },
|
||||
};
|
||||
|
||||
struct repetitor repetitor = {};
|
||||
repetitor_init(&repetitor);
|
||||
printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000));
|
||||
|
||||
uint64_t byte_count = 4096 * 512;
|
||||
uint8_t *buffer = malloc(sizeof(uint8_t) * byte_count);
|
||||
|
||||
for (int i = 0; i < ARRAY_LEN(cases); i++) {
|
||||
struct testcase *testcase = &cases[i];
|
||||
repetitor_clear(&repetitor);
|
||||
|
||||
while (repetitor_repeat(&repetitor, 2)) {
|
||||
repetitor_start(&repetitor);
|
||||
testcase->cb(buffer, byte_count);
|
||||
repetitor_stop(&repetitor, byte_count);
|
||||
}
|
||||
|
||||
repetitor_print_results_label(&repetitor, testcase->name);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
80
src/nop_loop.asm
Normal file
80
src/nop_loop.asm
Normal file
@ -0,0 +1,80 @@
|
||||
global writes_bytes_mov_asm
|
||||
global writes_bytes_nop_3x1_asm
|
||||
global writes_bytes_nop_1x3_asm
|
||||
global writes_bytes_nop_1x9_asm
|
||||
global writes_bytes_cmp_asm
|
||||
global writes_bytes_dec_asm
|
||||
|
||||
section .text
|
||||
|
||||
; rsi - byte_count
|
||||
; rdi - buffer
|
||||
writes_bytes_mov_asm:
|
||||
xor rax, rax
|
||||
.loop:
|
||||
mov [rdi+rax], al
|
||||
inc rax
|
||||
cmp rax, rsi
|
||||
jb .loop
|
||||
ret
|
||||
|
||||
; rsi - byte_count
|
||||
; rdi - buffer
|
||||
writes_bytes_nop_3x1_asm:
|
||||
xor rax, rax
|
||||
.loop:
|
||||
db 0x0f, 0x1f, 0x00
|
||||
inc rax
|
||||
cmp rsi,rax
|
||||
jne .loop
|
||||
ret
|
||||
|
||||
; rsi - byte_count
|
||||
; rdi - buffer
|
||||
writes_bytes_nop_1x3_asm:
|
||||
xor rax, rax
|
||||
.loop:
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
inc rax
|
||||
cmp rsi,rax
|
||||
jne .loop
|
||||
ret
|
||||
|
||||
; rsi - byte_count
|
||||
; rdi - buffer
|
||||
writes_bytes_nop_1x9_asm:
|
||||
xor rax, rax
|
||||
.loop:
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
inc rax
|
||||
cmp rsi,rax
|
||||
jne .loop
|
||||
ret
|
||||
|
||||
; rsi - byte_count
|
||||
; rdi - buffer
|
||||
writes_bytes_cmp_asm:
|
||||
xor rax, rax
|
||||
.loop:
|
||||
inc rax
|
||||
cmp rsi,rax
|
||||
jne .loop
|
||||
ret
|
||||
|
||||
; rsi - byte_count
|
||||
; rdi - buffer
|
||||
writes_bytes_dec_asm:
|
||||
.loop:
|
||||
dec rsi
|
||||
jnz .loop
|
||||
ret
|
8
src/nop_loop.h
Normal file
8
src/nop_loop.h
Normal file
@ -0,0 +1,8 @@
|
||||
#include <stdint.h>
|
||||
|
||||
void writes_bytes_mov_asm(uint8_t *buffer, uint64_t byte_count);
|
||||
void writes_bytes_nop_3x1_asm(uint8_t *buffer, uint64_t byte_count);
|
||||
void writes_bytes_nop_1x3_asm(uint8_t *buffer, uint64_t byte_count);
|
||||
void writes_bytes_nop_1x9_asm(uint8_t *buffer, uint64_t byte_count);
|
||||
void writes_bytes_cmp_asm(uint8_t *buffer, uint64_t byte_count);
|
||||
void writes_bytes_dec_asm(uint8_t *buffer, uint64_t byte_count);
|
182
src/repetition_tester.c
Normal file
182
src/repetition_tester.c
Normal file
@ -0,0 +1,182 @@
|
||||
#ifndef REPETITION_TESTER_
|
||||
#define REPETITION_TESTER_
|
||||
|
||||
#include <stdint.h>
|
||||
#include "rprof.h"
|
||||
|
||||
struct repetitor {
|
||||
uint64_t cpu_freq;
|
||||
|
||||
uint64_t repeat_cycle;
|
||||
|
||||
uint64_t repetition_count;
|
||||
uint64_t time_taken;
|
||||
uint64_t page_faults;
|
||||
|
||||
uint64_t total_time_taken;
|
||||
uint64_t total_page_faults;
|
||||
uint64_t total_byte_count;
|
||||
|
||||
uint64_t min_time_taken;
|
||||
uint64_t min_page_faults;
|
||||
uint64_t min_byte_count;
|
||||
|
||||
uint64_t max_time_taken;
|
||||
uint64_t max_page_faults;
|
||||
uint64_t max_byte_count;
|
||||
|
||||
uint64_t last_min_time_taken;
|
||||
uint64_t last_found_min_time;
|
||||
};
|
||||
|
||||
static uint64_t min_uint64(uint64_t a, uint64_t b) {
|
||||
return a > b ? b : a;
|
||||
}
|
||||
|
||||
static uint64_t max_uint64(uint64_t a, uint64_t b) {
|
||||
return a < b ? b : a;
|
||||
}
|
||||
|
||||
void repetitor_clear(struct repetitor *repetitor) {
|
||||
repetitor->repetition_count = 0;
|
||||
|
||||
repetitor->time_taken = 0;
|
||||
repetitor->page_faults = 0;
|
||||
|
||||
repetitor->total_page_faults = 0;
|
||||
repetitor->total_time_taken = 0;
|
||||
repetitor->total_byte_count = 0;
|
||||
|
||||
repetitor->min_page_faults = 0;
|
||||
repetitor->min_time_taken = 0;
|
||||
repetitor->min_byte_count = 0;
|
||||
|
||||
repetitor->max_page_faults = 0;
|
||||
repetitor->max_time_taken = 0;
|
||||
repetitor->max_byte_count = 0;
|
||||
|
||||
repetitor->last_found_min_time = 0;
|
||||
}
|
||||
|
||||
void repetitor_init(struct repetitor *repetitor) {
|
||||
repetitor->cpu_freq = rprof_get_cpu_timer_hz(100);
|
||||
repetitor->repeat_cycle = 0;
|
||||
repetitor_clear(repetitor);
|
||||
}
|
||||
|
||||
void repetitor_start(struct repetitor *repetitor) {
|
||||
repetitor->page_faults -= read_page_faults();
|
||||
repetitor->time_taken -= rprof_read_cpu_timer();
|
||||
}
|
||||
|
||||
void repetitor_stop(struct repetitor *repetitor, uint64_t byte_count) {
|
||||
repetitor->time_taken += rprof_read_cpu_timer();
|
||||
repetitor->page_faults += read_page_faults();
|
||||
|
||||
if (repetitor->repetition_count == 0) {
|
||||
repetitor->max_time_taken = repetitor->time_taken;
|
||||
repetitor->max_page_faults = repetitor->page_faults;
|
||||
repetitor->max_byte_count = byte_count;
|
||||
repetitor->min_time_taken = repetitor->time_taken;
|
||||
repetitor->min_page_faults = repetitor->page_faults;
|
||||
repetitor->min_byte_count = byte_count;
|
||||
} else {
|
||||
repetitor->max_time_taken = max_uint64(repetitor->max_time_taken, repetitor->time_taken);
|
||||
repetitor->max_page_faults = max_uint64(repetitor->max_page_faults, repetitor->page_faults);
|
||||
repetitor->max_byte_count = max_uint64(repetitor->max_byte_count, byte_count);
|
||||
repetitor->min_time_taken = min_uint64(repetitor->min_time_taken, repetitor->time_taken);
|
||||
repetitor->min_page_faults = min_uint64(repetitor->min_page_faults, repetitor->page_faults);
|
||||
repetitor->min_byte_count = min_uint64(repetitor->min_byte_count, byte_count);
|
||||
}
|
||||
|
||||
repetitor->total_time_taken += repetitor->time_taken;
|
||||
repetitor->total_page_faults += repetitor->page_faults;
|
||||
repetitor->total_byte_count += byte_count;
|
||||
repetitor->repetition_count += 1;
|
||||
|
||||
repetitor->time_taken = 0;
|
||||
repetitor->page_faults = 0;
|
||||
}
|
||||
|
||||
bool repetitor_repeat(struct repetitor *repetitor, float timeout_seconds) {
|
||||
if (repetitor->repetition_count == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
uint64_t now = rprof_read_cpu_timer();
|
||||
|
||||
if (repetitor->last_found_min_time != 0) {
|
||||
uint64_t time_since_last_find = (now - repetitor->last_found_min_time);
|
||||
uint64_t timeout_cycles = timeout_seconds * repetitor->cpu_freq;
|
||||
if (time_since_last_find > timeout_cycles) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (repetitor->min_time_taken < repetitor->last_min_time_taken || repetitor->last_found_min_time == 0) {
|
||||
repetitor->last_min_time_taken = repetitor->min_time_taken;
|
||||
repetitor->last_found_min_time = now;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool repeat_forever(struct repetitor *repetitor) {
|
||||
repetitor->repeat_cycle += 1;
|
||||
printf("======= Cycle %ld ================================\n", repetitor->repeat_cycle);
|
||||
return true;
|
||||
}
|
||||
|
||||
static float cycles_to_s(struct repetitor *repetitor, uint64_t cycles) {
|
||||
return (float)cycles / repetitor->cpu_freq;
|
||||
}
|
||||
|
||||
static float cycles_to_ms(struct repetitor *repetitor, uint64_t cycles) {
|
||||
return cycles_to_s(repetitor, cycles) * 1000;
|
||||
}
|
||||
|
||||
static float bytes_to_kb(uint64_t bytes) {
|
||||
return (float)bytes / 1000;
|
||||
}
|
||||
|
||||
static float bytes_to_gb_s(struct repetitor *repetitor, uint64_t bytes, uint64_t cycles) {
|
||||
return bytes_to_kb(bytes) / (1000*1000) / cycles_to_s(repetitor, cycles);
|
||||
}
|
||||
|
||||
static float divide_safe(float a, float b) {
|
||||
return b != 0 ? a / b : 0;
|
||||
}
|
||||
|
||||
void repetitor_print_results(struct repetitor *repetitor) {
|
||||
uint64_t avg_time_taken = repetitor->total_time_taken/repetitor->repetition_count;
|
||||
uint64_t min_time_taken = repetitor->min_time_taken;
|
||||
uint64_t max_time_taken = repetitor->max_time_taken;
|
||||
|
||||
uint64_t avg_byte_count = repetitor->total_byte_count/repetitor->repetition_count;
|
||||
uint64_t min_byte_count = repetitor->min_byte_count;
|
||||
uint64_t max_byte_count = repetitor->max_byte_count;
|
||||
|
||||
float avg_bandwidth = bytes_to_gb_s(repetitor, avg_byte_count, avg_time_taken);
|
||||
float min_bandwidth = bytes_to_gb_s(repetitor, min_byte_count, min_time_taken);
|
||||
float max_bandwidth = bytes_to_gb_s(repetitor, max_byte_count, max_time_taken);
|
||||
|
||||
float avg_page_faults = (float)repetitor->total_page_faults/repetitor->repetition_count;
|
||||
uint64_t min_page_faults = repetitor->min_page_faults;
|
||||
uint64_t max_page_faults = repetitor->max_page_faults;
|
||||
|
||||
float avg_kb_per_fault = divide_safe(bytes_to_kb(avg_byte_count), avg_page_faults);
|
||||
float min_kb_per_fault = divide_safe(bytes_to_kb(repetitor->min_page_faults), min_page_faults);
|
||||
float max_kb_per_fault = divide_safe(bytes_to_kb(repetitor->max_page_faults), max_page_faults);
|
||||
|
||||
printf(" : %16s : %10s : %12s : %18s : %12s : %18s :\n", "Time (cycles)", "Time (ms)", "Memory (B)", "Bandwidth (GiB/s)", "Page faults", "KiB per page fault");
|
||||
printf("Best : %16ld : %10.6f : %12ld : %18.6f : %12ld : %18.6f :\n", min_time_taken, cycles_to_ms(repetitor, min_time_taken), min_byte_count, min_bandwidth, min_page_faults, min_kb_per_fault);
|
||||
printf("Average : %16ld : %10.6f : %12ld : %18.6f : %12.6f : %18.6f :\n", avg_time_taken, cycles_to_ms(repetitor, avg_time_taken), avg_byte_count, avg_bandwidth, avg_page_faults, avg_kb_per_fault);
|
||||
printf("Worst : %16ld : %10.6f : %12ld : %18.6f : %12ld : %18.6f :\n", max_time_taken, cycles_to_ms(repetitor, max_time_taken), max_byte_count, max_bandwidth, max_page_faults, max_kb_per_fault);
|
||||
}
|
||||
|
||||
void repetitor_print_results_label(struct repetitor *repetitor, char *label) {
|
||||
printf("--------- %s ---------\n", label);
|
||||
repetitor_print_results(repetitor);
|
||||
}
|
||||
|
||||
#endif //REPETITION_TESTER_
|
344
src/rprof.h
Normal file
344
src/rprof.h
Normal file
@ -0,0 +1,344 @@
|
||||
#ifndef RPROF_H
|
||||
#define RPROF_H
|
||||
|
||||
// TODO: Maybe remove `assert()`, to lower overhead? Put them behind a macro?
|
||||
|
||||
// Available defines for configuration:
|
||||
// RPROF_IMPLEMENTATION:
|
||||
// Enable implementation of library
|
||||
//
|
||||
// RPROF_MAX_STACK (default 128):
|
||||
// To record nested blocks rprof uses a stack, this defines the maximum size of that stack
|
||||
//
|
||||
// RPROF_MAX_SLOTS (default 32):
|
||||
// When using `rprof_start()`, you need to specify into which slot the timing will be saved.
|
||||
// This defines how many slots you have available.
|
||||
//
|
||||
// RPROF_ONLY_TOTAL_TIME:
|
||||
// Don't time block marked between `rprof_start()` and `rprof_end()`.
|
||||
// Useful for checking the overhead added by the profiler.
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdbool.h>
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include <sys/param.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#ifndef RPROF_MAX_STACK
|
||||
#define RPROF_MAX_STACK 128
|
||||
#endif
|
||||
|
||||
#ifndef RPROF_MAX_SLOTS
|
||||
#define RPROF_MAX_SLOTS 32
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
const char *label;
|
||||
|
||||
uint32_t calls;
|
||||
uint64_t inclusive_duration;
|
||||
uint64_t exclusive_duration;
|
||||
uint64_t bytes_processed;
|
||||
} rprof_slot;
|
||||
|
||||
typedef struct {
|
||||
bool started;
|
||||
bool finished;
|
||||
|
||||
uint64_t init_time;
|
||||
uint64_t end_time;
|
||||
|
||||
uint32_t stack_size;
|
||||
size_t slot_stack[RPROF_MAX_STACK];
|
||||
uint64_t duration_stack[RPROF_MAX_STACK];
|
||||
uint64_t timer_stack[RPROF_MAX_STACK];
|
||||
|
||||
rprof_slot slots[RPROF_MAX_SLOTS];
|
||||
} rprof;
|
||||
|
||||
typedef int prof_sort_cmp_cb(const rprof_slot **A, const rprof_slot **B);
|
||||
|
||||
static rprof g_rprof = { 0 };
|
||||
|
||||
void rprof_init();
|
||||
void rprof_end();
|
||||
void rprof_start(size_t slot_idx, char *label, uint64_t bytes_processed);
|
||||
void rprof_stop();
|
||||
|
||||
int rprof_cmp_by_calls(const rprof_slot **A, const rprof_slot **B);
|
||||
int rprof_cmp_by_exclusive_duration(const rprof_slot **A, const rprof_slot **B);
|
||||
int rprof_cmp_by_inclusive_duration(const rprof_slot **A, const rprof_slot **B);
|
||||
void rprof_output(prof_sort_cmp_cb sort_cb);
|
||||
|
||||
#define RPROF_START(label) rprof_start(__COUNTER__, label, 0)
|
||||
#define RPROF_START_BYTES(label, bytes) rprof_start(__COUNTER__, label, bytes)
|
||||
#define RPROF_STOP() rprof_stop()
|
||||
|
||||
#define RPROF_IMPLEMENTATION
|
||||
#ifdef RPROF_IMPLEMENTATION
|
||||
|
||||
// ------------------------ CPU Timing -------------------------
|
||||
|
||||
#ifdef WIN32
|
||||
#include <intrin.h>
|
||||
#include <windows.h>
|
||||
|
||||
static uint64_t rprof_get_os_timer_hz(void)
|
||||
{
|
||||
LARGE_INTEGER Freq;
|
||||
QueryPerformanceFrequency(&Freq);
|
||||
return Freq.QuadPart;
|
||||
}
|
||||
|
||||
static uint64_t rprof_read_os_timer(void)
|
||||
{
|
||||
LARGE_INTEGER Value;
|
||||
QueryPerformanceCounter(&Value);
|
||||
return Value.QuadPart;
|
||||
}
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#include <time.h>
|
||||
|
||||
static uint64_t rprof_get_os_timer_hz(void)
|
||||
{
|
||||
return 1000000000;
|
||||
}
|
||||
|
||||
static uint64_t rprof_read_os_timer(void)
|
||||
{
|
||||
struct timespec time;
|
||||
clock_gettime(CLOCK_MONOTONIC_RAW, &time);
|
||||
return rprof_get_os_timer_hz()*time.tv_sec + time.tv_nsec;
|
||||
}
|
||||
#endif // WIN32
|
||||
|
||||
|
||||
static uint64_t rprof_read_cpu_timer(void)
|
||||
{
|
||||
return __rdtsc();
|
||||
}
|
||||
|
||||
static uint64_t rprof_get_cpu_timer_hz(uint64_t measure_time_ms)
|
||||
{
|
||||
uint64_t os_freq = rprof_get_os_timer_hz();
|
||||
uint64_t os_start = rprof_read_os_timer();
|
||||
uint64_t os_elapsed = 0;
|
||||
|
||||
uint64_t cpu_start = rprof_read_cpu_timer();
|
||||
|
||||
uint64_t wait_duration = os_freq * measure_time_ms / 1000;
|
||||
while (os_elapsed < wait_duration) {
|
||||
os_elapsed = rprof_read_os_timer() - os_start;
|
||||
}
|
||||
|
||||
uint64_t cpu_elapsed = rprof_read_cpu_timer() - cpu_start;
|
||||
if (os_elapsed) {
|
||||
return os_freq * cpu_elapsed / os_elapsed;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------ Page faults -------------------------
|
||||
|
||||
#include <sys/resource.h>
|
||||
|
||||
uint64_t read_page_faults() {
|
||||
// NOTE: ru_minflt the number of page faults serviced without any I/O activity.
|
||||
// ru_majflt the number of page faults serviced that required I/O activity.
|
||||
struct rusage usage = {};
|
||||
getrusage(RUSAGE_SELF, &usage);
|
||||
int Result = usage.ru_minflt + usage.ru_majflt;
|
||||
return Result;
|
||||
}
|
||||
|
||||
// ------------------------ Profiling -------------------------
|
||||
|
||||
void rprof_init()
|
||||
{
|
||||
assert(!g_rprof.started);
|
||||
g_rprof.init_time = rprof_read_cpu_timer();
|
||||
g_rprof.started = true;
|
||||
}
|
||||
|
||||
void rprof_end()
|
||||
{
|
||||
assert(!g_rprof.finished);
|
||||
g_rprof.end_time = rprof_read_cpu_timer();
|
||||
g_rprof.finished = true;
|
||||
}
|
||||
|
||||
static int rprof_cmp_u32(uint32_t A, uint32_t B)
|
||||
{
|
||||
if (A == B) {
|
||||
return 0;
|
||||
} else if (A < B) {
|
||||
return 1;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
int rprof_cmp_by_calls(const rprof_slot **A, const rprof_slot **B)
|
||||
{
|
||||
return rprof_cmp_u32((*A)->calls, (*B)->calls);
|
||||
}
|
||||
|
||||
int rprof_cmp_by_exclusive_duration(const rprof_slot **A, const rprof_slot **B)
|
||||
{
|
||||
return rprof_cmp_u32((*A)->exclusive_duration, (*B)->exclusive_duration);
|
||||
}
|
||||
|
||||
int rprof_cmp_by_inclusive_duration(const rprof_slot **A, const rprof_slot **B)
|
||||
{
|
||||
return rprof_cmp_u32((*A)->inclusive_duration, (*B)->inclusive_duration);
|
||||
}
|
||||
|
||||
#ifndef RPROF_ONLY_TOTAL_TIME
|
||||
|
||||
#ifndef ARRAY_LEN
|
||||
#define ARRAY_LEN(x) (sizeof(x)/sizeof(x[0]))
|
||||
#endif
|
||||
|
||||
void rprof_start(size_t slot_idx, char *label, uint64_t bytes_processed)
|
||||
{
|
||||
assert(g_rprof.started);
|
||||
assert(!g_rprof.finished);
|
||||
|
||||
assert(slot_idx < RPROF_MAX_SLOTS);
|
||||
assert(g_rprof.stack_size < RPROF_MAX_STACK-1);
|
||||
|
||||
rprof_slot *slot = &g_rprof.slots[slot_idx];
|
||||
slot->label = label;
|
||||
slot->calls++;
|
||||
slot->bytes_processed += bytes_processed;
|
||||
|
||||
g_rprof.duration_stack[g_rprof.stack_size] = slot->inclusive_duration;
|
||||
g_rprof.slot_stack[g_rprof.stack_size] = slot_idx;
|
||||
g_rprof.timer_stack[g_rprof.stack_size] = rprof_read_cpu_timer();
|
||||
g_rprof.stack_size++;
|
||||
}
|
||||
|
||||
void rprof_stop()
|
||||
{
|
||||
uint64_t now = rprof_read_cpu_timer();
|
||||
g_rprof.stack_size--;
|
||||
uint64_t start = g_rprof.timer_stack[g_rprof.stack_size];
|
||||
size_t slot_idx = g_rprof.slot_stack[g_rprof.stack_size];
|
||||
size_t inclusive_duration = g_rprof.duration_stack[g_rprof.stack_size];
|
||||
|
||||
uint64_t duration = (now - start);
|
||||
|
||||
if (g_rprof.stack_size > 0) {
|
||||
size_t parent_slot = g_rprof.slot_stack[g_rprof.stack_size-1];
|
||||
g_rprof.slots[parent_slot].exclusive_duration -= duration;
|
||||
}
|
||||
|
||||
g_rprof.slots[slot_idx].exclusive_duration += duration;
|
||||
g_rprof.slots[slot_idx].inclusive_duration = inclusive_duration + duration;
|
||||
}
|
||||
|
||||
void rprof_output(prof_sort_cmp_cb sort_cb)
|
||||
{
|
||||
assert(g_rprof.finished);
|
||||
|
||||
uint64_t total_time = g_rprof.end_time - g_rprof.init_time;
|
||||
uint64_t cpu_hz = rprof_get_cpu_timer_hz(100);
|
||||
float total_time_secs = (float)total_time / cpu_hz;
|
||||
|
||||
rprof_slot *slots[RPROF_MAX_SLOTS+1] = { 0 };
|
||||
uint32_t slot_count = 0;
|
||||
uint64_t profiled_duration = 0;
|
||||
uint32_t label_width = 0;
|
||||
|
||||
for (int i = 0; i < RPROF_MAX_SLOTS; i++) {
|
||||
rprof_slot *slot = &g_rprof.slots[i];
|
||||
if (slot->label) {
|
||||
slots[slot_count] = slot;
|
||||
slot_count++;
|
||||
label_width = MAX(label_width, strlen(slot->label));
|
||||
profiled_duration += slot->exclusive_duration;
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t other_duration = total_time - profiled_duration;
|
||||
rprof_slot other_slot = {
|
||||
.label = "<other>",
|
||||
.calls = 1,
|
||||
.inclusive_duration = other_duration,
|
||||
.exclusive_duration = other_duration
|
||||
};
|
||||
|
||||
slots[slot_count++] = &other_slot;
|
||||
|
||||
if (sort_cb) {
|
||||
qsort(slots, slot_count, sizeof(rprof_slot*), (void*)sort_cb);
|
||||
}
|
||||
|
||||
printf("\nTotal time taken: %.3fms (%lu) (CPU: ~%.3fGHz)\n", total_time_secs*1000, total_time, (float)cpu_hz/1000000000);
|
||||
|
||||
uint32_t duration_max_width = 0;
|
||||
uint32_t percent_max_width = 0;
|
||||
char percent_column[RPROF_MAX_SLOTS+1][128];
|
||||
for (int i = 0; i < slot_count; i++) {
|
||||
rprof_slot *slot = slots[i];
|
||||
|
||||
float percent = (float)slot->inclusive_duration*100/total_time;
|
||||
float exclusive_percent = (float)slot->exclusive_duration*100/total_time;
|
||||
uint32_t length;
|
||||
if (slot->inclusive_duration == slot->exclusive_duration) {
|
||||
length = snprintf(percent_column[i], 128, "(%6.3f%%)", exclusive_percent);
|
||||
} else {
|
||||
length = snprintf(percent_column[i], 128, "(%6.3f%%, %6.3f%% w/children)", exclusive_percent, percent);
|
||||
}
|
||||
percent_max_width = MAX(percent_max_width, length);
|
||||
duration_max_width = MAX(duration_max_width, (int)log10(slot->inclusive_duration) + 1);
|
||||
}
|
||||
|
||||
char line_format[128];
|
||||
snprintf(line_format, ARRAY_LEN(line_format), " %%%ds - %%%dlu %%-%ds [%%d]", label_width, duration_max_width, percent_max_width);
|
||||
|
||||
for (int i = 0; i < slot_count; i++) {
|
||||
rprof_slot *slot = slots[i];
|
||||
|
||||
printf(line_format, slot->label, slot->inclusive_duration, percent_column[i], slot->calls);
|
||||
if (slot->bytes_processed > 0) {
|
||||
float time_spent = (float)slot->inclusive_duration / cpu_hz;
|
||||
float megabytes = (float)slot->bytes_processed / (1024 * 1024);
|
||||
printf(" %.3fmb", megabytes);
|
||||
if (megabytes > 10) {
|
||||
printf(" at %.3fgb/s", (megabytes / 1024) / time_spent);
|
||||
} else {
|
||||
printf(" at %.3fmb/s", megabytes / time_spent);
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
static_assert(__COUNTER__ < RPROF_MAX_SLOTS, "__COUNTER__ reached max profiler slots");
|
||||
|
||||
#else
|
||||
|
||||
#define rprof_start(...)
|
||||
#define rprof_stop(...)
|
||||
|
||||
void rprof_output(prof_sort_cmp_cb sort_cb)
|
||||
{
|
||||
assert(g_rprof.finished);
|
||||
|
||||
uint64_t total_time = g_rprof.end_time - g_rprof.init_time;
|
||||
uint64_t cpu_hz = rprof_get_cpu_timer_hz(100);
|
||||
|
||||
printf("\nTotal time taken: %.3fms (%lu)\n", (float)total_time*1000/cpu_hz, total_time);
|
||||
}
|
||||
|
||||
#endif // RPROF_ONLY_TOTAL_TIME
|
||||
|
||||
#endif // RPROF_IMPLEMENTATION
|
||||
|
||||
#endif //RPROF_H
|
Loading…
Reference in New Issue
Block a user