1
0

add non-temporal store tests

This commit is contained in:
Rokas Puzonas 2024-05-19 13:10:11 +03:00
parent 3f66b12c92
commit 0c0a4b6975
2 changed files with 278 additions and 0 deletions

View File

@ -0,0 +1,115 @@
#include "repetition_tester.c"
#include <sys/mman.h>
/*
 * Store kernels implemented in assembly and linked in as external symbols.
 *
 * Common contract, as implemented by the assembly routines:
 *   load_buffer  - source bytes; read in 128-byte chunks.
 *   load_size    - number of source bytes; must be a non-zero multiple of 128.
 *   store_buffer - destination; the kernels write within
 *                  load_size * store_repeat bytes starting here.
 *   store_repeat - how many times the source data is stored.
 *
 * v1 variants copy the whole load buffer store_repeat times back to back,
 * with the destination advancing continuously.
 * v2 variants load each 128-byte chunk once and store it store_repeat times,
 * stepping the destination by load_size bytes between stores.
 *
 * The "temporal" variants use ordinary (cached) 256-bit stores; the
 * "non_temporal" variants use vmovntdq, which requires the destination
 * addresses to be 32-byte aligned.
 */
void store_temporal_v1(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
void store_non_temporal_v1(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
void store_temporal_v2(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
void store_non_temporal_v2(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
/* Converts a count of kibibytes (2^10 bytes each) into bytes. */
static uint64_t kibibytes(uint64_t count) {
    /* Shifting left by 10 is the same as multiplying by 1024. */
    uint64_t bytes = count << 10;
    return bytes;
}
/* Converts a count of mebibytes (2^20 bytes each) into bytes. */
static uint64_t mibibytes(uint64_t count) {
    /* One mebibyte is 1024 kibibytes; the shift by 10 applies that factor. */
    uint64_t kib_bytes = kibibytes(count);
    return kib_bytes << 10;
}
/* Converts a count of gibibytes (2^30 bytes each) into bytes. */
static uint64_t gibibytes(uint64_t count) {
    /* One gibibyte is 1024 mebibytes; the shift by 10 applies that factor. */
    uint64_t mib_bytes = mibibytes(count);
    return mib_bytes << 10;
}
int main() {
uint32_t load_size = mibibytes(2);
uint32_t store_repeat = 256;
assert(load_size % 128 == 0); // Must be a 128 byte multiple
struct repetitor repetitor = {};
repetitor_init(&repetitor);
printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000));
uint64_t buffer_size = gibibytes(1);
if (buffer_size % 4096) {
printf("ERROR: Size of buffer is not page aligned\n");
return -1;
}
uint8_t *buffer = mmap(0, buffer_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (buffer == NULL) {
printf("ERROR: Failed to allocate buffer\n");
return -1;
}
if ((size_t)buffer % 64 != 0) {
printf("ERROR: Allocated buffer is not cache line aligned, it is %ld\n", (size_t)buffer % 64);
return -1;
}
// Touch pages so they would be mapped in, to avoid page faults during tests
for (uint64_t i = 0; i < buffer_size; i += 4096) {
buffer[i] = (uint8_t)i;
}
assert(load_size * store_repeat <= buffer_size/2);
uint8_t *load_buffer = buffer;
uint8_t *store_buffer = buffer + buffer_size / 2;
{
repetitor_clear(&repetitor);
while (repetitor_repeat(&repetitor, 10)) {
repetitor_start(&repetitor);
repetitor_measure_start(&repetitor);
store_temporal_v1(load_buffer, load_size, store_buffer, store_repeat);
repetitor_measure_stop(&repetitor, load_size * (1+store_repeat));
repetitor_stop(&repetitor);
}
repetitor_print_results_label(&repetitor, "temporal v1");
}
{
repetitor_clear(&repetitor);
while (repetitor_repeat(&repetitor, 10)) {
repetitor_start(&repetitor);
repetitor_measure_start(&repetitor);
store_non_temporal_v1(load_buffer, load_size, store_buffer, store_repeat);
repetitor_measure_stop(&repetitor, load_size * (1+store_repeat));
repetitor_stop(&repetitor);
}
repetitor_print_results_label(&repetitor, "non-temporal v1");
}
{
repetitor_clear(&repetitor);
while (repetitor_repeat(&repetitor, 10)) {
repetitor_start(&repetitor);
repetitor_measure_start(&repetitor);
store_temporal_v2(load_buffer, load_size, store_buffer, store_repeat);
repetitor_measure_stop(&repetitor, load_size * (1+store_repeat));
repetitor_stop(&repetitor);
}
repetitor_print_results_label(&repetitor, "temporal v2");
}
{
repetitor_clear(&repetitor);
while (repetitor_repeat(&repetitor, 10)) {
repetitor_start(&repetitor);
repetitor_measure_start(&repetitor);
store_non_temporal_v2(load_buffer, load_size, store_buffer, store_repeat);
repetitor_measure_stop(&repetitor, load_size * (1+store_repeat));
repetitor_stop(&repetitor);
}
repetitor_print_results_label(&repetitor, "non-temporal v2");
}
munmap(buffer, buffer_size);
return 0;
}

View File

@ -0,0 +1,163 @@
; Export the four store kernels so they can be linked against from C.
global store_temporal_v1
global store_non_temporal_v1
global store_temporal_v2
global store_non_temporal_v2
; Everything below is executable code.
section .text
; void store_temporal_v1(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
;
; Copies the load buffer into the store buffer store_repeat times back to
; back using ordinary (cached) 256-bit stores. The destination keeps
; advancing across repeats, so load_size * store_repeat bytes are written.
;
; System V AMD64 arguments:
;   rdi - load buffer
;   esi - load size in bytes; must be a multiple of 128
;   rdx - store buffer
;   ecx - store repeat
;
; Returns immediately when load size or store repeat is zero.
; Only caller-saved registers (r8-r10, ymm0-ymm3) are used, so nothing has to
; be preserved on the stack.
store_temporal_v1:
    ; The ABI leaves the upper 32 bits of 32-bit arguments undefined;
    ; a 32-bit mov zero-extends them before they are used as 64-bit counters.
    mov esi, esi
    mov ecx, ecx
    ; A zero count would make the sub/dec + jnz loops wrap around.
    test rsi, rsi
    jz .done
    test rcx, rcx
    jz .done
    mov r9, rdx                 ; r9 = store cursor, never reset between repeats
.many_copy:
    mov r8, rdi                 ; r8 = load cursor, restarted for every repeat
    mov r10, rsi                ; r10 = bytes left in this repeat
.single_copy:
    vmovdqu ymm0, [r8 + 0]
    vmovdqu ymm1, [r8 + 32]
    vmovdqu ymm2, [r8 + 64]
    vmovdqu ymm3, [r8 + 96]
    vmovdqu [r9 + 0], ymm0
    vmovdqu [r9 + 32], ymm1
    vmovdqu [r9 + 64], ymm2
    vmovdqu [r9 + 96], ymm3
    add r8, 128
    add r9, 128
    sub r10, 128
    jnz .single_copy
    dec rcx
    jnz .many_copy
.done:
    ; Clear the upper ymm halves so the SSE code the C compiler may emit
    ; does not pay an AVX/SSE transition penalty.
    vzeroupper
    ret
; void store_temporal_v2(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
;
; Loads each 128-byte chunk of the load buffer once and stores it
; store_repeat times with ordinary (cached) 256-bit stores, stepping the
; destination by load_size bytes between stores.
;
; System V AMD64 arguments:
;   rdi - load buffer
;   esi - load size in bytes; must be a multiple of 128
;   rdx - store buffer
;   ecx - store repeat
;
; Returns immediately when load size or store repeat is zero.
; Only caller-saved registers (r8-r11, ymm0-ymm3) are used, so nothing has to
; be preserved on the stack.
;
; NOTE(review): the store cursor restarts at the store buffer for every chunk,
; so all chunks overwrite the same 128 bytes at each load_size stride instead
; of landing at their own offset. Kept as-is; confirm this is intended.
store_temporal_v2:
    ; The ABI leaves the upper 32 bits of 32-bit arguments undefined;
    ; a 32-bit mov zero-extends them before they are used as 64-bit values.
    mov esi, esi
    mov ecx, ecx
    ; A zero count would make the sub/dec + jnz loops wrap around.
    test rsi, rsi
    jz .done
    test rcx, rcx
    jz .done
    mov r8, rdi                 ; r8 = load cursor
    mov r9, rsi                 ; r9 = bytes left to load
.outer_loop:
    vmovdqu ymm0, [r8 + 0]
    vmovdqu ymm1, [r8 + 32]
    vmovdqu ymm2, [r8 + 64]
    vmovdqu ymm3, [r8 + 96]
    mov r10, rdx                ; r10 = store cursor, restarted per chunk
    mov r11, rcx                ; r11 = stores left for this chunk
.inner_loop:
    vmovdqu [r10 + 0], ymm0
    vmovdqu [r10 + 32], ymm1
    vmovdqu [r10 + 64], ymm2
    vmovdqu [r10 + 96], ymm3
    add r10, rsi                ; next store is load_size bytes further on
    dec r11
    jnz .inner_loop
    add r8, 128
    sub r9, 128
    jnz .outer_loop
.done:
    ; Clear the upper ymm halves so the SSE code the C compiler may emit
    ; does not pay an AVX/SSE transition penalty.
    vzeroupper
    ret
; void store_non_temporal_v1(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
;
; Copies the load buffer into the store buffer store_repeat times back to
; back using non-temporal 256-bit stores (vmovntdq). The destination keeps
; advancing across repeats, so load_size * store_repeat bytes are written.
;
; System V AMD64 arguments:
;   rdi - load buffer
;   esi - load size in bytes; must be a multiple of 128
;   rdx - store buffer; must be 32-byte aligned (vmovntdq faults otherwise)
;   ecx - store repeat
;
; Returns immediately when load size or store repeat is zero.
; Only caller-saved registers (r8-r10, ymm0-ymm3) are used, so nothing has to
; be preserved on the stack.
store_non_temporal_v1:
    ; The ABI leaves the upper 32 bits of 32-bit arguments undefined;
    ; a 32-bit mov zero-extends them before they are used as 64-bit counters.
    mov esi, esi
    mov ecx, ecx
    ; A zero count would make the sub/dec + jnz loops wrap around.
    test rsi, rsi
    jz .done
    test rcx, rcx
    jz .done
    mov r9, rdx                 ; r9 = store cursor, never reset between repeats
.many_copy:
    mov r8, rdi                 ; r8 = load cursor, restarted for every repeat
    mov r10, rsi                ; r10 = bytes left in this repeat
.single_copy:
    vmovdqu ymm0, [r8 + 0]
    vmovdqu ymm1, [r8 + 32]
    vmovdqu ymm2, [r8 + 64]
    vmovdqu ymm3, [r8 + 96]
    vmovntdq [r9 + 0], ymm0
    vmovntdq [r9 + 32], ymm1
    vmovntdq [r9 + 64], ymm2
    vmovntdq [r9 + 96], ymm3
    add r8, 128
    add r9, 128
    sub r10, 128
    jnz .single_copy
    dec rcx
    jnz .many_copy
    ; Non-temporal stores are weakly ordered; fence so they are ordered
    ; before anything the caller does after this function returns.
    sfence
.done:
    ; Clear the upper ymm halves so the SSE code the C compiler may emit
    ; does not pay an AVX/SSE transition penalty.
    vzeroupper
    ret
; void store_non_temporal_v2(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
;
; Loads each 128-byte chunk of the load buffer once and stores it
; store_repeat times with non-temporal 256-bit stores (vmovntdq), stepping
; the destination by load_size bytes between stores.
;
; System V AMD64 arguments:
;   rdi - load buffer
;   esi - load size in bytes; must be a multiple of 128
;   rdx - store buffer; must be 32-byte aligned (vmovntdq faults otherwise)
;   ecx - store repeat
;
; Returns immediately when load size or store repeat is zero.
; Only caller-saved registers (r8-r11, ymm0-ymm3) are used, so nothing has to
; be preserved on the stack.
;
; NOTE(review): the store cursor restarts at the store buffer for every chunk,
; so all chunks overwrite the same 128 bytes at each load_size stride instead
; of landing at their own offset. Kept as-is; confirm this is intended.
store_non_temporal_v2:
    ; The ABI leaves the upper 32 bits of 32-bit arguments undefined;
    ; a 32-bit mov zero-extends them before they are used as 64-bit values.
    mov esi, esi
    mov ecx, ecx
    ; A zero count would make the sub/dec + jnz loops wrap around.
    test rsi, rsi
    jz .done
    test rcx, rcx
    jz .done
    mov r8, rdi                 ; r8 = load cursor
    mov r9, rsi                 ; r9 = bytes left to load
.outer_loop:
    vmovdqu ymm0, [r8 + 0]
    vmovdqu ymm1, [r8 + 32]
    vmovdqu ymm2, [r8 + 64]
    vmovdqu ymm3, [r8 + 96]
    mov r10, rdx                ; r10 = store cursor, restarted per chunk
    mov r11, rcx                ; r11 = stores left for this chunk
.inner_loop:
    vmovntdq [r10 + 0], ymm0
    vmovntdq [r10 + 32], ymm1
    vmovntdq [r10 + 64], ymm2
    vmovntdq [r10 + 96], ymm3
    add r10, rsi                ; next store is load_size bytes further on
    dec r11
    jnz .inner_loop
    add r8, 128
    sub r9, 128
    jnz .outer_loop
    ; Non-temporal stores are weakly ordered; fence so they are ordered
    ; before anything the caller does after this function returns.
    sfence
.done:
    ; Clear the upper ymm halves so the SSE code the C compiler may emit
    ; does not pay an AVX/SSE transition penalty.
    vzeroupper
    ret