Add the read_widths benchmark: Makefile rule, dispatcher entry, C driver, NASM routines and header.
Review fixes are folded in, so the "index" lines still carry the blob ids recorded before those fixes.

diff --git a/Makefile b/Makefile
index feacc98..c2f2412 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,12 @@
 CFLAGS=-lm -g -Wall -O1
 
-build/main: src/main.c src/repetition_tester.c build/multi_nop_loop.o build/write_loops.o build/load_uop.o build/store_uop.o build/short_load_uop.o src/rprof.h
+# TODO: Move this to a build.zig. This is becoming a mess, I don't want to deal with Makefiles anymore
+
+# src/main_read_widths.c and src/read_widths.h end up in build/main through #include,
+# so they are listed as prerequisites: editing either one must trigger a rebuild.
+build/main: src/main.c src/main_read_widths.c src/read_widths.h src/repetition_tester.c build/multi_nop_loop.o build/write_loops.o build/load_uop.o build/store_uop.o build/short_load_uop.o src/rprof.h build/read_widths.o
 	mkdir -p build
-	gcc -o build/main src/main.c build/multi_nop_loop.o build/write_loops.o build/load_uop.o build/store_uop.o build/short_load_uop.o $(CFLAGS)
+	gcc -o build/main src/main.c build/multi_nop_loop.o build/write_loops.o build/load_uop.o build/store_uop.o build/short_load_uop.o build/read_widths.o $(CFLAGS)
 
 build/short_load_uop.o: src/short_load_uop.asm
 	nasm -g -f elf64 -o build/short_load_uop.o src/short_load_uop.asm
@@ -19,6 +23,9 @@ build/multi_nop_loop.o: src/multi_nop_loop.asm
 build/store_uop.o: src/store_uop.asm
 	nasm -g -f elf64 -o build/store_uop.o src/store_uop.asm
 
+build/read_widths.o: src/read_widths.asm
+	nasm -g -f elf64 -o build/read_widths.o src/read_widths.asm
+
 run: ./build/main
 	./build/main $(TEST_NAME)
 
diff --git a/src/main.c b/src/main.c
index 6922aa1..ca707ea 100644
--- a/src/main.c
+++ b/src/main.c
@@ -17,6 +17,7 @@
 #include "main_load_uop.c"
 #include "main_store_uop.c"
 #include "main_short_load_uop.c"
+#include "main_read_widths.c"
 
 int main(int argc, char **argv) {
     if (argc < 2) {
@@ -28,6 +29,8 @@ int main(int argc, char **argv) {
 
     if (!strncmp(test_name, "write_bytes_asm", sizeof("write_bytes_asm"))) {
         return main_test_write_bytes_asm();
-    } else if (!strncmp(test_name, "write_all_bytes", sizeof("write_bytes"))) {
+    } else if (!strncmp(test_name, "read_widths", sizeof("read_widths"))) {
+        return main_test_read_widths();
+    } else if (!strncmp(test_name, "write_all_bytes", sizeof("write_all_bytes"))) {
         return main_test_write_all_bytes();
     } else if (!strncmp(test_name, "load_uop", sizeof("load_uop"))) {
diff --git a/src/main_read_widths.c b/src/main_read_widths.c
new file mode 100644
index 0000000..f08b168
--- /dev/null
+++ b/src/main_read_widths.c
@@ -0,0 +1,59 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "repetition_tester.c"
+#include "read_widths.h"
+
+/*
+ * Runs each read_width_*x2 assembly routine under the repetition tester.
+ *
+ * Every routine is handed the same 4 MiB heap buffer. Each call is bracketed
+ * by repetitor_measure_start()/repetitor_measure_stop(), passing that byte
+ * count, and the results are printed under the routine's name.
+ *
+ * Returns 0 on success, 1 if the buffer cannot be allocated.
+ */
+int main_test_read_widths(void) {
+    typedef void (*test_cb)(uint8_t *buffer, uint64_t byte_count);
+    struct testcase {
+        char *name;
+        test_cb cb;
+    };
+
+    struct testcase cases[] = {
+        { .name = "read_width_4x2()", .cb = read_width_4x2 },
+        { .name = "read_width_8x2()", .cb = read_width_8x2 },
+        { .name = "read_width_16x2()", .cb = read_width_16x2 },
+        { .name = "read_width_32x2()", .cb = read_width_32x2 },
+    };
+
+    struct repetitor repetitor = {};
+    repetitor_init(&repetitor);
+    printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000));
+
+    uint64_t byte_count = 4096 * 1024;
+    // Heap-allocate the buffer: as a variable-length array on the stack, 4 MiB
+    // can exceed the stack limit. calloc also hands the routines zeroed bytes
+    // instead of indeterminate ones.
+    uint8_t *buffer = calloc(byte_count, sizeof *buffer);
+    if (!buffer) {
+        fprintf(stderr, "read_widths: failed to allocate %llu bytes\n", (unsigned long long)byte_count);
+        return 1;
+    }
+
+    for (size_t i = 0; i < ARRAY_LEN(cases); i++) {
+        struct testcase *testcase = &cases[i];
+        repetitor_clear(&repetitor);
+        while (repetitor_repeat(&repetitor, 2)) {
+            repetitor_start(&repetitor);
+            repetitor_measure_start(&repetitor);
+            testcase->cb(buffer, byte_count);
+            repetitor_measure_stop(&repetitor, byte_count);
+            repetitor_stop(&repetitor);
+        }
+        repetitor_print_results_label(&repetitor, testcase->name);
+    }
+
+    free(buffer);
+    return 0;
+}
diff --git a/src/read_widths.asm b/src/read_widths.asm
new file mode 100644
index 0000000..e160087
--- /dev/null
+++ b/src/read_widths.asm
@@ -0,0 +1,72 @@
+global read_width_4x2
+global read_width_8x2
+global read_width_16x2
+global read_width_32x2
+
+section .text
+
+; Every routine below has the C signature
+;     void f(uint8_t *buffer, uint64_t byte_count)
+; and loops over two loads per iteration. rcx counts the bytes loaded so far
+; and the loop ends once it reaches byte_count. rdi is never advanced, so every
+; iteration re-reads the same bytes at the start of buffer, and the loop body
+; runs at least once even when byte_count is 0.
+
+; Two 4-byte loads per iteration.
+; rsi - byte_count
+; rdi - buffer
+read_width_4x2:
+    xor rcx, rcx
+    align 64
+.loop:
+    mov r8d, [rdi]
+    mov r8d, [rdi + 4]
+    add rcx, 8
+    cmp rcx, rsi
+    jb .loop
+    ret
+
+; Two 8-byte loads per iteration.
+; rsi - byte_count
+; rdi - buffer
+read_width_8x2:
+    xor rcx, rcx
+    align 64
+.loop:
+    mov r8, [rdi]
+    mov r8, [rdi + 8]
+    add rcx, 16
+    cmp rcx, rsi
+    jb .loop
+    ret
+
+; Two 16-byte loads per iteration.
+; rsi - byte_count
+; rdi - buffer
+read_width_16x2:
+    xor rcx, rcx
+    align 64
+.loop:
+    vmovdqu xmm0, [rdi]
+    vmovdqu xmm0, [rdi + 16]
+    add rcx, 32
+    cmp rcx, rsi
+    jb .loop
+    ret
+
+; Two 32-byte loads per iteration.
+; rsi - byte_count
+; rdi - buffer
+read_width_32x2:
+    xor rcx, rcx
+    align 64
+.loop:
+    vmovdqu ymm0, [rdi]
+    vmovdqu ymm0, [rdi + 32]
+    add rcx, 64
+    cmp rcx, rsi
+    jb .loop
+    ; The loads above wrote ymm0. Clear the upper ymm state before returning so
+    ; that SSE code in the caller does not pay an AVX/SSE transition penalty.
+    vzeroupper
+    ret
diff --git a/src/read_widths.h b/src/read_widths.h
new file mode 100644
index 0000000..ff60ee0
--- /dev/null
+++ b/src/read_widths.h
@@ -0,0 +1,15 @@
+#ifndef READ_WIDTHS_H
+#define READ_WIDTHS_H
+
+#include <stdint.h>
+
+// Read benchmarks implemented in src/read_widths.asm. Each one loops over two
+// loads of the width in its name (4, 8, 16 or 32 bytes) from the start of
+// buffer until its counter reaches byte_count. buffer must be readable for at
+// least 64 bytes, the most any routine touches.
+void read_width_4x2(uint8_t *buffer, uint64_t byte_count);
+void read_width_8x2(uint8_t *buffer, uint64_t byte_count);
+void read_width_16x2(uint8_t *buffer, uint64_t byte_count);
+void read_width_32x2(uint8_t *buffer, uint64_t byte_count);
+
+#endif // READ_WIDTHS_H