diff --git a/src/12_unaligned_load_penalties/cyclic_load_bytes.asm b/src/12_unaligned_load_penalties/cyclic_load_bytes.asm
new file mode 100644
index 0000000..4ac9f0b
--- /dev/null
+++ b/src/12_unaligned_load_penalties/cyclic_load_bytes.asm
@@ -0,0 +1,35 @@
+global cyclic_load_bytes
+
+section .text
+
+; Makes repeated passes over the start of a buffer using 32-byte loads.
+; Every inner iteration issues four vmovdqu loads (128 bytes in total) into
+; ymm0; the loaded values are overwritten and never used.
+;
+; rdi - buffer            (start address of every pass)
+; rsi - inner_loop_count  (bytes to cover per pass; r9 advances by 128 and is
+;                          compared against it, so non-multiples of 128 round up)
+; rdx - outer_loop_count  (number of passes)
+;
+; Both loops test their condition at the bottom, so one 128-byte iteration
+; runs even when a count is 0.
+cyclic_load_bytes:
+    xor r8, r8                  ; r8 = completed passes
+.outer_loop:
+    mov rcx, rdi                ; every pass restarts at the buffer start
+    xor r9, r9                  ; r9 = bytes covered in this pass
+
+    .inner_loop:
+        vmovdqu ymm0, [rcx]
+        vmovdqu ymm0, [rcx + 32]
+        vmovdqu ymm0, [rcx + 64]
+        vmovdqu ymm0, [rcx + 96]
+        add rcx, 128
+        add r9, 128
+        cmp r9, rsi
+        jb .inner_loop          ; unsigned: continue while r9 < rsi
+
+    inc r8
+    cmp r8, rdx
+    jb .outer_loop              ; unsigned: continue while r8 < rdx
+    ret
diff --git a/src/12_unaligned_load_penalties/cyclic_load_bytes.h b/src/12_unaligned_load_penalties/cyclic_load_bytes.h
new file mode 100644
index 0000000..1ed23a2
--- /dev/null
+++ b/src/12_unaligned_load_penalties/cyclic_load_bytes.h
@@ -0,0 +1,5 @@
+#include <stdint.h>
+
+// Implemented in cyclic_load_bytes.asm: makes outer_loop_count passes over
+// buffer, loading inner_loop_count bytes per pass in 128-byte steps.
+void cyclic_load_bytes(uint8_t *buffer, uint64_t inner_loop_count, uint64_t outer_loop_count);
diff --git a/src/12_unaligned_load_penalties/main.c b/src/12_unaligned_load_penalties/main.c
new file mode 100644
index 0000000..065bb53
--- /dev/null
+++ b/src/12_unaligned_load_penalties/main.c
@@ -0,0 +1,86 @@
+#include "repetition_tester.c"
+#include "cyclic_load_bytes.h"
+#include <inttypes.h>
+#include <sys/mman.h>
+
+// Unit helpers: each converts a count of KiB / MiB / GiB into bytes.
+static uint64_t kibibytes(uint64_t count) {
+    return 1024 * count;
+}
+
+static uint64_t mibibytes(uint64_t count) {
+    return 1024 * kibibytes(count);
+}
+
+static uint64_t gibibytes(uint64_t count) {
+    return 1024 * mibibytes(count);
+}
+
+// Times cyclic_load_bytes() for every working-set size in byte_counts at
+// every start offset in offsets, and prints one labelled result per pair.
+int main(void) {
+    // uint64_t byte_counts[] = { kibibytes(16), kibibytes(64), mibibytes(1) };
+    uint64_t byte_counts[] = { mibibytes(1) };
+    uint64_t offsets[] = { 0, 1, 2, 4, 8, 16, 32, 64 };
+
+    struct repetitor repetitor = {};
+    repetitor_init(&repetitor);
+    printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000));
+
+    uint64_t buffer_size = gibibytes(1);
+    if (buffer_size % 4096) {
+        printf("ERROR: Size of buffer is not page aligned\n");
+        return -1;
+    }
+
+    // mmap() signals failure with MAP_FAILED rather than NULL.
+    uint8_t *buffer = mmap(NULL, buffer_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (buffer == MAP_FAILED) {
+        printf("ERROR: Failed to allocate buffer\n");
+        return -1;
+    }
+
+    if ((size_t)buffer % 64 != 0) {
+        printf("ERROR: Allocated buffer is not cache line aligned, it is %zu\n", (size_t)buffer % 64);
+        return -1;
+    }
+
+    // Touch pages so they would be mapped in, to avoid page faults during tests
+    for (uint64_t i = 0; i < buffer_size; i += 4096) {
+        buffer[i] = (uint8_t)i;
+    }
+
+    for (size_t i = 0; i < ARRAY_LEN(byte_counts); i++) {
+        uint64_t unadjusted_byte_count = byte_counts[i];
+        assert(unadjusted_byte_count % 128 == 0); // Must be divisible by 128
+        assert(unadjusted_byte_count > 128); // byte_count is used as a divisor below
+        // One 128-byte iteration is dropped; presumably so the largest offset
+        // (64) still keeps the read window within the nominal size.
+        uint64_t byte_count = unadjusted_byte_count - 128;
+
+        for (size_t j = 0; j < ARRAY_LEN(offsets); j++) {
+            uint64_t offset = offsets[j];
+
+            // Whole passes only, so bytes_read matches what is actually loaded.
+            uint64_t outer_loop_count = buffer_size / byte_count;
+            uint64_t bytes_read = outer_loop_count * byte_count;
+
+            repetitor_clear(&repetitor);
+            while (repetitor_repeat(&repetitor, 5)) {
+                repetitor_start(&repetitor);
+                repetitor_measure_start(&repetitor);
+                cyclic_load_bytes(buffer + offset, byte_count, outer_loop_count);
+                repetitor_measure_stop(&repetitor, bytes_read);
+                repetitor_stop(&repetitor);
+            }
+
+            char name[128] = { 0 };
+            snprintf(name, sizeof(name), "%" PRIu64 " offset, %" PRIu64 " size", offset, byte_count);
+            repetitor_print_results_label(&repetitor, name);
+        }
+    }
+
+    munmap(buffer, buffer_size);
+
+    return 0;
+}