diff --git a/src/cyclic_load_bytes.asm b/src/cyclic_load_bytes.asm
new file mode 100644
index 0000000..9e94a8a
--- /dev/null
+++ b/src/cyclic_load_bytes.asm
@@ -0,0 +1,26 @@
+; Repeatedly loads 128-byte chunks from a window of `buffer` selected by `mask`.
+; Each iteration reads the 128 bytes at buffer + (offset & mask), where offset
+; starts at 0 and advances by 128 until it reaches byte_count (at least one pass runs).
+; With mask = 2^k - 1 (k >= 7) every read stays inside the first 2^k bytes of buffer.
+global cyclic_load_bytes
+
+section .text
+
+; rdi - buffer
+; rsi - byte_count
+; rdx - mask
+cyclic_load_bytes:
+    xor rcx, rcx                ; rcx = running byte offset
+    align 64
+.loop:
+    mov r8, rcx
+    and r8, rdx                 ; wrap the offset with the mask
+    add r8, rdi                 ; r8 = address of the current 128-byte chunk
+    vmovdqu ymm0, [r8]          ; four 32-byte loads; the loaded data is discarded
+    vmovdqu ymm0, [r8 + 32]
+    vmovdqu ymm0, [r8 + 64]
+    vmovdqu ymm0, [r8 + 96]
+    add rcx, 128
+    cmp rcx, rsi
+    jb .loop                    ; unsigned compare: continue while offset < byte_count
+    ret
diff --git a/src/cyclic_load_bytes.h b/src/cyclic_load_bytes.h
new file mode 100644
index 0000000..2156604
--- /dev/null
+++ b/src/cyclic_load_bytes.h
@@ -0,0 +1,5 @@
+#include <stdint.h>
+
+// Implemented in cyclic_load_bytes.asm. Issues 128-byte loads from
+// buffer + (offset & mask), stepping offset by 128 until it reaches byte_count.
+void cyclic_load_bytes(uint8_t *buffer, uint64_t byte_count, uint64_t mask);
diff --git a/src/repetition_tester.c b/src/repetition_tester.c
index 2443a53..7e668ff 100644
--- a/src/repetition_tester.c
+++ b/src/repetition_tester.c
@@ -165,6 +165,13 @@ static float divide_safe(float a, float b) {
     return b != 0 ? a / b : 0;
 }
 
+// Returns bytes_to_gb_s() for the run recorded in min_byte_count / min_time_taken.
+float repetitor_get_best_bandwidth(struct repetitor *repetitor) {
+    uint64_t min_time_taken = repetitor->min_time_taken;
+    uint64_t min_byte_count = repetitor->min_byte_count;
+    return bytes_to_gb_s(repetitor, min_byte_count, min_time_taken);
+}
+
 void repetitor_print_results(struct repetitor *repetitor) {
     uint64_t avg_time_taken = repetitor->total_time_taken/repetitor->repetition_count;
     uint64_t min_time_taken = repetitor->min_time_taken;
diff --git a/src/tests/cache_size.c b/src/tests/cache_size.c
new file mode 100644
index 0000000..a892401
--- /dev/null
+++ b/src/tests/cache_size.c
@@ -0,0 +1,78 @@
+#include "repetition_tester.c"
+#include "cyclic_load_bytes.h"
+#include <sys/mman.h>
+
+// One measurement: a label plus the address mask handed to cyclic_load_bytes().
+struct testcase {
+    char *name;
+    uint64_t mask;
+};
+
+// Each mask is (window size - 1), so the loads wrap inside the labelled window.
+struct testcase cases[] = {
+    { .name = "load 4KiB"  , .mask = (1ull << 12) - 1 },
+    { .name = "load 8KiB"  , .mask = (1ull << 13) - 1 },
+    { .name = "load 16KiB" , .mask = (1ull << 14) - 1 },
+    { .name = "load 32KiB" , .mask = (1ull << 15) - 1 },
+    { .name = "load 64KiB" , .mask = (1ull << 16) - 1 },
+    { .name = "load 128KiB", .mask = (1ull << 17) - 1 },
+
+    { .name = "load 512KiB", .mask = (1ull << 19) - 1 },
+    { .name = "load 1MiB"  , .mask = (1ull << 20) - 1 },
+    { .name = "load 2MiB"  , .mask = (1ull << 21) - 1 },
+    { .name = "load 4MiB"  , .mask = (1ull << 22) - 1 },
+    { .name = "load 8MiB"  , .mask = (1ull << 23) - 1 },
+    { .name = "load 16MiB" , .mask = (1ull << 24) - 1 },
+    { .name = "load 32MiB" , .mask = (1ull << 25) - 1 },
+    { .name = "load 64MiB" , .mask = (1ull << 26) - 1 },
+    { .name = "load 128MiB", .mask = (1ull << 27) - 1 },
+
+    { .name = "load 1GiB"  , .mask = (1ull << 30) - 1 },
+};
+
+int main() {
+    struct repetitor repetitor = {};
+    repetitor_init(&repetitor);
+    printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000));
+
+    uint64_t byte_count = 1024 * 1024 * 1024;
+    if (byte_count % 4096) {
+        printf("ERROR: Size of buffer is not page aligned\n");
+        return -1;
+    }
+
+    uint8_t *buffer = mmap(0, byte_count, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    // mmap() reports failure with MAP_FAILED, not NULL.
+    if (buffer == MAP_FAILED) {
+        printf("ERROR: Failed to allocate buffer\n");
+        return -1;
+    }
+
+    if ((size_t)buffer % 64 != 0) {
+        printf("ERROR: Allocated buffer is not cache line aligned, it is %zu\n", (size_t)buffer % 64);
+        return -1;
+    }
+
+    // Touch pages so they would be mapped in, to avoid page faults during tests
+    for (uint64_t i = 0; i < byte_count; i += 4096) {
+        buffer[i] = (uint8_t)i;
+    }
+
+    for (int i = 0; i < ARRAY_LEN(cases); i++) {
+        struct testcase *testcase = &cases[i];
+        repetitor_clear(&repetitor);
+        while (repetitor_repeat(&repetitor, 2)) {
+            repetitor_start(&repetitor);
+            repetitor_measure_start(&repetitor);
+            cyclic_load_bytes(buffer, byte_count, testcase->mask);
+            repetitor_measure_stop(&repetitor, byte_count);
+            repetitor_stop(&repetitor);
+        }
+        repetitor_print_results_label(&repetitor, testcase->name);
+        // printf("%ld;%f\n", testcase->mask, repetitor_get_best_bandwidth(&repetitor));
+    }
+
+    munmap(buffer, byte_count);
+
+    return 0;
+}
diff --git a/src/tests/load_uop.c b/src/tests/load_uop.c
index 4d7d149..4c85231 100644
--- a/src/tests/load_uop.c
+++ b/src/tests/load_uop.c
@@ -1,20 +1,20 @@
 #include "repetition_tester.c"
 #include "load_uop.h"
 
+typedef void (*test_cb)(uint8_t *buffer, uint64_t byte_count);
+struct testcase {
+    char *name;
+    test_cb cb;
+};
+
+struct testcase cases[] = {
+    { .name = "mov_load_x1()", .cb = mov_load_x1 },
+    { .name = "mov_load_x2()", .cb = mov_load_x2 },
+    { .name = "mov_load_x3()", .cb = mov_load_x3 },
+    { .name = "mov_load_x4()", .cb = mov_load_x4 },
+};
+
 int main() {
-    typedef void (*test_cb)(uint8_t *buffer, uint64_t byte_count);
-    struct testcase {
-        char *name;
-        test_cb cb;
-    };
-
-    struct testcase cases[] = {
-        { .name = "mov_load_x1()", .cb = mov_load_x1 },
-        { .name = "mov_load_x2()", .cb = mov_load_x2 },
-        { .name = "mov_load_x3()", .cb = mov_load_x3 },
-        { .name = "mov_load_x4()", .cb = mov_load_x4 },
-    };
-
     struct repetitor repetitor = {};
     repetitor_init(&repetitor);
     printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000));