From 0c0a4b6975a6baf51bda0a9b29d9dd2c8ad9b4c3 Mon Sep 17 00:00:00 2001
From: Rokas Puzonas
Date: Sun, 19 May 2024 13:10:11 +0300
Subject: [PATCH] add non-temporal store tests

---
 src/14_non_temporal_store/main.c              | 132 ++++++++++++++
 .../non_temporal_store.asm                    | 189 ++++++++++++++++++++
 2 files changed, 321 insertions(+)
 create mode 100644 src/14_non_temporal_store/main.c
 create mode 100644 src/14_non_temporal_store/non_temporal_store.asm

diff --git a/src/14_non_temporal_store/main.c b/src/14_non_temporal_store/main.c
new file mode 100644
index 0000000..6792dff
--- /dev/null
+++ b/src/14_non_temporal_store/main.c
@@ -0,0 +1,132 @@
+#include "repetition_tester.c"
+#include <sys/mman.h>
+
+// Copy routines implemented in non_temporal_store.asm.
+// load_size is in bytes and must be a non-zero multiple of 128, store_repeat must be non-zero.
+//
+// v1: copies the whole load buffer store_repeat times into consecutive destination ranges.
+void store_temporal_v1(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
+void store_non_temporal_v1(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
+
+// v2: loads every 128 byte chunk once and stores it store_repeat times, load_size bytes apart.
+void store_temporal_v2(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
+void store_non_temporal_v2(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
+
+// Size helpers, each converts a count of units into bytes.
+static uint64_t kibibytes(uint64_t count) {
+    return 1024 * count;
+}
+
+static uint64_t mibibytes(uint64_t count) {
+    return 1024 * kibibytes(count);
+}
+
+static uint64_t gibibytes(uint64_t count) {
+    return 1024 * mibibytes(count);
+}
+
+int main(void) {
+    uint32_t load_size = mibibytes(2);
+    uint32_t store_repeat = 256;
+    // The assembly loops count down to exactly zero, so both values must be non-zero
+    // and load_size must be a 128 byte multiple
+    assert(load_size > 0 && load_size % 128 == 0);
+    assert(store_repeat > 0);
+
+    // Widen before multiplying so that bigger configurations cannot wrap around in 32 bits
+    uint64_t store_size = (uint64_t)load_size * store_repeat;
+    // Bytes touched per run: one read of the load buffer plus everything that is stored
+    uint64_t bytes_per_run = load_size + store_size;
+
+    struct repetitor repetitor = {};
+    repetitor_init(&repetitor);
+    printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000));
+
+    uint64_t buffer_size = gibibytes(1);
+    if (buffer_size % 4096) {
+        printf("ERROR: Size of buffer is not page aligned\n");
+        return -1;
+    }
+
+    uint8_t *buffer = mmap(0, buffer_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    // mmap signals failure with MAP_FAILED, not NULL
+    if (buffer == MAP_FAILED) {
+        printf("ERROR: Failed to allocate buffer\n");
+        return -1;
+    }
+
+    if ((size_t)buffer % 64 != 0) {
+        printf("ERROR: Allocated buffer is not cache line aligned, it is %zu\n", (size_t)buffer % 64);
+        munmap(buffer, buffer_size);
+        return -1;
+    }
+
+    // Touch pages so they would be mapped in, to avoid page faults during tests
+    for (uint64_t i = 0; i < buffer_size; i += 4096) {
+        buffer[i] = (uint8_t)i;
+    }
+
+    // All copies must fit into the upper half of the buffer
+    assert(store_size <= buffer_size/2);
+
+    uint8_t *load_buffer = buffer;
+    uint8_t *store_buffer = buffer + buffer_size / 2;
+
+    {
+        repetitor_clear(&repetitor);
+        while (repetitor_repeat(&repetitor, 10)) {
+            repetitor_start(&repetitor);
+            repetitor_measure_start(&repetitor);
+            store_temporal_v1(load_buffer, load_size, store_buffer, store_repeat);
+            repetitor_measure_stop(&repetitor, bytes_per_run);
+            repetitor_stop(&repetitor);
+        }
+
+        repetitor_print_results_label(&repetitor, "temporal v1");
+    }
+
+    {
+        repetitor_clear(&repetitor);
+        while (repetitor_repeat(&repetitor, 10)) {
+            repetitor_start(&repetitor);
+            repetitor_measure_start(&repetitor);
+            store_non_temporal_v1(load_buffer, load_size, store_buffer, store_repeat);
+            repetitor_measure_stop(&repetitor, bytes_per_run);
+            repetitor_stop(&repetitor);
+        }
+
+        repetitor_print_results_label(&repetitor, "non-temporal v1");
+    }
+
+    {
+        repetitor_clear(&repetitor);
+        while (repetitor_repeat(&repetitor, 10)) {
+            repetitor_start(&repetitor);
+            repetitor_measure_start(&repetitor);
+            store_temporal_v2(load_buffer, load_size, store_buffer, store_repeat);
+            repetitor_measure_stop(&repetitor, bytes_per_run);
+            repetitor_stop(&repetitor);
+        }
+
+        repetitor_print_results_label(&repetitor, "temporal v2");
+    }
+
+    {
+        repetitor_clear(&repetitor);
+        while (repetitor_repeat(&repetitor, 10)) {
+            repetitor_start(&repetitor);
+            repetitor_measure_start(&repetitor);
+            store_non_temporal_v2(load_buffer, load_size, store_buffer, store_repeat);
+            repetitor_measure_stop(&repetitor, bytes_per_run);
+            repetitor_stop(&repetitor);
+        }
+
+        repetitor_print_results_label(&repetitor, "non-temporal v2");
+    }
+
+    munmap(buffer, buffer_size);
+
+    return 0;
+}
+
+
diff --git a/src/14_non_temporal_store/non_temporal_store.asm b/src/14_non_temporal_store/non_temporal_store.asm
new file mode 100644
index 0000000..49c89b3
--- /dev/null
+++ b/src/14_non_temporal_store/non_temporal_store.asm
@@ -0,0 +1,189 @@
+global store_temporal_v1
+global store_non_temporal_v1
+
+global store_temporal_v2
+global store_non_temporal_v2
+
+section .text
+
+; All routines follow the System V AMD64 calling convention. load_size and
+; store_repeat are 32-bit arguments, so the upper halves of rsi and rcx are
+; undefined on entry and get zero-extended before being used as 64-bit values.
+
+; void store_temporal_v1(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
+; Copies the load buffer store_repeat times into consecutive ranges of the store buffer.
+; rdi - load buffer
+; esi - load size in bytes (non-zero multiple of 128)
+; rdx - store buffer
+; ecx - store repeat (non-zero)
+store_temporal_v1:
+    push r10
+    push r11
+    push r12
+
+    mov esi, esi            ; zero-extend load size
+    mov ecx, ecx            ; zero-extend store repeat
+
+    mov r11, rdx
+  .many_copy:
+    mov r10, rdi
+    mov r12, rsi
+  .single_copy:
+    vmovdqu ymm0, [r10 + 0]
+    vmovdqu ymm1, [r10 + 32]
+    vmovdqu ymm2, [r10 + 64]
+    vmovdqu ymm3, [r10 + 96]
+
+    vmovdqu [r11 + 0], ymm0
+    vmovdqu [r11 + 32], ymm1
+    vmovdqu [r11 + 64], ymm2
+    vmovdqu [r11 + 96], ymm3
+
+    add r10, 128
+    add r11, 128
+
+    sub r12, 128
+    jnz .single_copy
+    dec rcx
+    jnz .many_copy
+
+    pop r12
+    pop r11
+    pop r10
+    ret
+
+; void store_temporal_v2(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
+; Loads each 128 byte chunk once and stores it store_repeat times, load_size bytes apart.
+; NOTE(review): the store pointer restarts at the store buffer for every chunk, so all
+; chunks are written to the same store_repeat locations. If a full copy like v1 is
+; intended, the chunk offset has to be added to r11 - confirm.
+; rdi - load buffer
+; esi - load size in bytes (non-zero multiple of 128)
+; rdx - store buffer
+; ecx - store repeat (non-zero)
+store_temporal_v2:
+    push r10
+    push r11
+    push r12
+    push r13
+
+    mov esi, esi            ; zero-extend load size
+    mov ecx, ecx            ; zero-extend store repeat
+
+    mov r10, rdi
+    mov r13, rsi
+  .outer_loop:
+    vmovdqu ymm0, [r10 + 0]
+    vmovdqu ymm1, [r10 + 32]
+    vmovdqu ymm2, [r10 + 64]
+    vmovdqu ymm3, [r10 + 96]
+
+    mov r11, rdx
+    mov r12, rcx
+  .inner_loop:
+    vmovdqu [r11 + 0], ymm0
+    vmovdqu [r11 + 32], ymm1
+    vmovdqu [r11 + 64], ymm2
+    vmovdqu [r11 + 96], ymm3
+    add r11, rsi
+    dec r12
+    jnz .inner_loop
+
+    add r10, 128
+    sub r13, 128
+    jnz .outer_loop
+
+    pop r13
+    pop r12
+    pop r11
+    pop r10
+    ret
+
+; void store_non_temporal_v1(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
+; Same copy pattern as store_temporal_v1, but the stores use vmovntdq (store buffer must be 32 byte aligned).
+; rdi - load buffer
+; esi - load size in bytes (non-zero multiple of 128)
+; rdx - store buffer
+; ecx - store repeat (non-zero)
+store_non_temporal_v1:
+    push r10
+    push r11
+    push r12
+
+    mov esi, esi            ; zero-extend load size
+    mov ecx, ecx            ; zero-extend store repeat
+
+    mov r11, rdx
+  .many_copy:
+    mov r10, rdi
+    mov r12, rsi
+  .single_copy:
+    vmovdqu ymm0, [r10 + 0]
+    vmovdqu ymm1, [r10 + 32]
+    vmovdqu ymm2, [r10 + 64]
+    vmovdqu ymm3, [r10 + 96]
+
+    vmovntdq [r11 + 0], ymm0
+    vmovntdq [r11 + 32], ymm1
+    vmovntdq [r11 + 64], ymm2
+    vmovntdq [r11 + 96], ymm3
+
+    add r10, 128
+    add r11, 128
+
+    sub r12, 128
+    jnz .single_copy
+    dec rcx
+    jnz .many_copy
+
+    pop r12
+    pop r11
+    pop r10
+    ret
+
+; void store_non_temporal_v2(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
+; Same store pattern as store_temporal_v2, but the stores use vmovntdq (store buffer must be 32 byte aligned).
+; NOTE(review): the store pointer restarts at the store buffer for every chunk, so all
+; chunks are written to the same store_repeat locations. If a full copy like v1 is
+; intended, the chunk offset has to be added to r11 - confirm.
+; rdi - load buffer
+; esi - load size in bytes (non-zero multiple of 128)
+; rdx - store buffer
+; ecx - store repeat (non-zero)
+store_non_temporal_v2:
+    push r10
+    push r11
+    push r12
+    push r13
+
+    mov esi, esi            ; zero-extend load size
+    mov ecx, ecx            ; zero-extend store repeat
+
+    mov r10, rdi
+    mov r13, rsi
+  .outer_loop:
+    vmovdqu ymm0, [r10 + 0]
+    vmovdqu ymm1, [r10 + 32]
+    vmovdqu ymm2, [r10 + 64]
+    vmovdqu ymm3, [r10 + 96]
+
+    mov r11, rdx
+    mov r12, rcx
+  .inner_loop:
+    vmovntdq [r11 + 0], ymm0
+    vmovntdq [r11 + 32], ymm1
+    vmovntdq [r11 + 64], ymm2
+    vmovntdq [r11 + 96], ymm3
+    add r11, rsi
+    dec r12
+    jnz .inner_loop
+
+    add r10, 128
+    sub r13, 128
+    jnz .outer_loop
+
+    pop r13
+    pop r12
+    pop r11
+    pop r10
+    ret