From cef071d28fbfb69460bcd97e355ddd43a361b173 Mon Sep 17 00:00:00 2001 From: Rokas Puzonas Date: Wed, 13 Mar 2024 21:52:42 +0200 Subject: [PATCH] update cache size tests --- src/cyclic_load_bytes.asm | 35 +++++++++-------- src/cyclic_load_bytes.h | 2 +- src/tests/cache_size.c | 79 +++++++++++++++++++++++++-------------- 3 files changed, 72 insertions(+), 44 deletions(-) diff --git a/src/cyclic_load_bytes.asm b/src/cyclic_load_bytes.asm index 9e94a8a..4ac9f0b 100644 --- a/src/cyclic_load_bytes.asm +++ b/src/cyclic_load_bytes.asm @@ -3,20 +3,25 @@ global cyclic_load_bytes section .text ; rdi - buffer -; rsi - byte_count -; rdx - mask +; rsi - inner_loop_count +; rdx - outer_loop_count cyclic_load_bytes: - xor rcx, rcx - align 64 -.loop: - mov r8, rcx - and r8, rdx - add r8, rdi - vmovdqu ymm0, [r8] - vmovdqu ymm0, [r8 + 32] - vmovdqu ymm0, [r8 + 64] - vmovdqu ymm0, [r8 + 96] - add rcx, 128 - cmp rcx, rsi - jb .loop + xor r8, r8 +.outer_loop: + mov rcx, rdi + xor r9, r9 + + .inner_loop: + vmovdqu ymm0, [rcx] + vmovdqu ymm0, [rcx + 32] + vmovdqu ymm0, [rcx + 64] + vmovdqu ymm0, [rcx + 96] + add rcx, 128 + add r9, 128 + cmp r9, rsi + jb .inner_loop + + inc r8 + cmp r8, rdx + jb .outer_loop ret diff --git a/src/cyclic_load_bytes.h b/src/cyclic_load_bytes.h index 2156604..1ed23a2 100644 --- a/src/cyclic_load_bytes.h +++ b/src/cyclic_load_bytes.h @@ -1,3 +1,3 @@ #include -void cyclic_load_bytes(uint8_t *buffer, uint64_t byte_count, uint64_t mask); +void cyclic_load_bytes(uint8_t *buffer, uint64_t inner_loop_count, uint64_t outer_loop_count); diff --git a/src/tests/cache_size.c b/src/tests/cache_size.c index a892401..3db71b6 100644 --- a/src/tests/cache_size.c +++ b/src/tests/cache_size.c @@ -4,42 +4,60 @@ struct testcase { char *name; - uint64_t mask; + uint64_t byte_count; }; -struct testcase cases[] = { - { .name = "load 4KiB" , .mask = 0b111111111111 }, - { .name = "load 8KiB" , .mask = 0b1111111111111 }, - { .name = "load 16KiB" , .mask = 0b11111111111111 }, - { .name = "load 32KiB" , .mask = 0b111111111111111 }, - { .name = "load 64KiB" , .mask = 0b1111111111111111 }, - { .name = "load 128KiB", .mask = 0b11111111111111111 }, +static uint64_t kibibytes(uint64_t count) { + return 1024 * count; +} - { .name = "load 512KiB", .mask = 0b1111111111111111111 }, - { .name = "load 1MiB" , .mask = 0b11111111111111111111 }, - { .name = "load 2MiB" , .mask = 0b111111111111111111111 }, - { .name = "load 4MiB" , .mask = 0b1111111111111111111111 }, - { .name = "load 8MiB" , .mask = 0b11111111111111111111111 }, - { .name = "load 16MiB" , .mask = 0b111111111111111111111111 }, - { .name = "load 32MiB" , .mask = 0b1111111111111111111111111 }, - { .name = "load 64MiB" , .mask = 0b11111111111111111111111111 }, - { .name = "load 128MiB", .mask = 0b111111111111111111111111111 }, +static uint64_t mibibytes(uint64_t count) { + return 1024 * kibibytes(count); +} + +static uint64_t gibibytes(uint64_t count) { + return 1024 * mibibytes(count); +} - { .name = "load 1GiB" , .mask = 0b111111111111111111111111111111 }, -}; int main() { + struct testcase cases[] = { + { .name = "16KiB" , .byte_count = kibibytes(16) }, + { .name = "32KiB" , .byte_count = kibibytes(32) }, + { .name = "64KiB" , .byte_count = kibibytes(64) }, + { .name = "128KiB", .byte_count = kibibytes(128) }, + { .name = "256KiB", .byte_count = kibibytes(256) }, + { .name = "464KiB", .byte_count = kibibytes(464) }, + { .name = "480KiB", .byte_count = kibibytes(480) }, + { .name = "496KiB", .byte_count = kibibytes(496) }, + { .name = "512KiB", .byte_count = kibibytes(512) }, + { .name = "1MiB" , .byte_count = mibibytes(1) }, + { .name = "2MiB" , .byte_count = mibibytes(2) }, + { .name = "4MiB" , .byte_count = mibibytes(4) }, + { .name = "8MiB" , .byte_count = mibibytes(8) }, + { .name = "16MiB" , .byte_count = mibibytes(16) }, + { .name = "20MiB" , .byte_count = mibibytes(20) }, + { .name = "24MiB" , .byte_count = mibibytes(24) }, + { .name = "30MiB" , .byte_count = mibibytes(30) }, + { .name = "32MiB" , .byte_count = mibibytes(32) }, + { .name = "34MiB" , .byte_count = mibibytes(34) }, + { .name = "40MiB" , .byte_count = mibibytes(40) }, + { .name = "48MiB" , .byte_count = mibibytes(48) }, + { .name = "64MiB" , .byte_count = mibibytes(64) }, + { .name = "128MiB", .byte_count = mibibytes(128) }, + }; + struct repetitor repetitor = {}; repetitor_init(&repetitor); printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000)); - uint64_t byte_count = 1024 * 1024 * 1024; - if (byte_count % 4096) { + uint64_t buffer_size = gibibytes(1); + if (buffer_size % 4096) { printf("ERROR: Size of buffer is not page aligned\n"); return -1; } - uint8_t *buffer = mmap(0, byte_count, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + uint8_t *buffer = mmap(0, buffer_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (buffer == NULL) { printf("ERROR: Failed to allocate buffer\n"); return -1; @@ -51,25 +69,30 @@ int main() { } // Touch pages so they would be mapped in, to avoid page faults during tests - for (uint64_t i = 0; i < byte_count; i += 4096) { + for (uint64_t i = 0; i < buffer_size; i += 4096) { buffer[i] = (uint8_t)i; } for (int i = 0; i < ARRAY_LEN(cases); i++) { struct testcase *testcase = &cases[i]; + assert(testcase->byte_count % 128 == 0); // Must be divisible by 128 + + uint64_t outer_loop_count = buffer_size / testcase->byte_count; + uint64_t bytes_read = outer_loop_count * testcase->byte_count; + repetitor_clear(&repetitor); while (repetitor_repeat(&repetitor, 2)) { repetitor_start(&repetitor); repetitor_measure_start(&repetitor); - cyclic_load_bytes(buffer, byte_count, testcase->mask); - repetitor_measure_stop(&repetitor, byte_count); + cyclic_load_bytes(buffer, testcase->byte_count, outer_loop_count); + repetitor_measure_stop(&repetitor, bytes_read); repetitor_stop(&repetitor); } - repetitor_print_results_label(&repetitor, testcase->name); - // printf("%ld;%f\n", testcase->mask, repetitor_get_best_bandwidth(&repetitor)); + // repetitor_print_results_label(&repetitor, testcase->name); + printf("%s;%f\n", testcase->name, repetitor_get_best_bandwidth(&repetitor)); } - munmap(buffer, byte_count); + munmap(buffer, buffer_size); return 0; }