add unaligned load penalty tests
This commit is contained in:
parent
adcb0891fb
commit
19b4bf5fbf
27
src/12_unaligned_load_penalties/cyclic_load_bytes.asm
Normal file
27
src/12_unaligned_load_penalties/cyclic_load_bytes.asm
Normal file
@ -0,0 +1,27 @@
|
||||
global cyclic_load_bytes
|
||||
|
||||
section .text
|
||||
|
||||
; cyclic_load_bytes - repeatedly stream 32-byte AVX loads over one memory region.
;
; Arguments (System V AMD64 calling convention):
;   rdi - buffer:           first byte of the region to read. Any alignment is
;                           accepted because the loads use vmovdqu.
;   rsi - inner_loop_count: number of bytes read per pass. The region is
;                           consumed in 128-byte steps, so a value that is not
;                           a multiple of 128 is effectively rounded up.
;   rdx - outer_loop_count: number of passes; every pass restarts at rdi.
;
; Returns nothing. Clobbers rcx, r8, r9, ymm0 and the flags.
; Performs no loads at all when either count is zero.
cyclic_load_bytes:
    test rsi, rsi               ; nothing to read per pass -> no loads
    jz .done
    test rdx, rdx               ; zero passes requested -> no loads
    jz .done

    xor r8, r8                  ; r8 = number of completed passes
.outer_loop:
    mov rcx, rdi                ; rcx = read cursor, rewound for every pass
    xor r9, r9                  ; r9 = bytes consumed in the current pass

.inner_loop:
    ; Four 32-byte loads cover 128 bytes per iteration. The loaded data is
    ; discarded; only the memory traffic matters.
    vmovdqu ymm0, [rcx]
    vmovdqu ymm0, [rcx + 32]
    vmovdqu ymm0, [rcx + 64]
    vmovdqu ymm0, [rcx + 96]
    add rcx, 128
    add r9, 128
    cmp r9, rsi
    jb .inner_loop              ; unsigned compare: continue while r9 < rsi

    inc r8
    cmp r8, rdx
    jb .outer_loop              ; unsigned compare: continue while r8 < rdx

.done:
    ; Clear the upper halves of the YMM registers before returning so that
    ; legacy-SSE code in the caller does not pay AVX/SSE transition penalties.
    vzeroupper
    ret
|
3
src/12_unaligned_load_penalties/cyclic_load_bytes.h
Normal file
3
src/12_unaligned_load_penalties/cyclic_load_bytes.h
Normal file
@ -0,0 +1,3 @@
|
||||
#ifndef CYCLIC_LOAD_BYTES_H
#define CYCLIC_LOAD_BYTES_H

#include <stdint.h>

/*
 * Reads memory starting at `buffer` with unaligned 32-byte vector loads and
 * discards the data (implemented in cyclic_load_bytes.asm).
 *
 * buffer           - first byte to read; it does not need to be aligned.
 * inner_loop_count - number of bytes read per pass. The routine advances in
 *                    128-byte steps, so the readable region starting at
 *                    `buffer` must cover this value rounded up to a multiple
 *                    of 128.
 * outer_loop_count - number of passes; each pass starts over at `buffer`.
 */
void cyclic_load_bytes(uint8_t *buffer, uint64_t inner_loop_count, uint64_t outer_loop_count);

#endif /* CYCLIC_LOAD_BYTES_H */
|
77
src/12_unaligned_load_penalties/main.c
Normal file
77
src/12_unaligned_load_penalties/main.c
Normal file
@ -0,0 +1,77 @@
|
||||
#include "repetition_tester.c"
|
||||
#include "cyclic_load_bytes.h"
|
||||
#include <sys/mman.h>
|
||||
|
||||
// Converts a number of KiB (units of 1024 bytes) into a byte count.
static uint64_t kibibytes(uint64_t count) {
    const uint64_t bytes_per_kib = 1024;
    return count * bytes_per_kib;
}
|
||||
|
||||
// Converts a number of MiB (units of 1024 KiB) into a byte count.
static uint64_t mibibytes(uint64_t count) {
    const uint64_t kib_per_mib = 1024;
    uint64_t bytes_for_count_kib = kibibytes(count);
    return kib_per_mib * bytes_for_count_kib;
}
|
||||
|
||||
// Converts a number of GiB (units of 1024 MiB) into a byte count.
static uint64_t gibibytes(uint64_t count) {
    const uint64_t mib_per_gib = 1024;
    uint64_t bytes_for_count_mib = mibibytes(count);
    return mib_per_gib * bytes_for_count_mib;
}
|
||||
|
||||
int main() {
|
||||
// uint32_t byte_counts[] = { kibibytes(16), kibibytes(64), mibibytes(1) };
|
||||
uint32_t byte_counts[] = { mibibytes(1) };
|
||||
uint64_t offsets[] = { 0, 1, 2, 4, 8, 16, 32, 64 };
|
||||
|
||||
struct repetitor repetitor = {};
|
||||
repetitor_init(&repetitor);
|
||||
printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000));
|
||||
|
||||
uint64_t buffer_size = gibibytes(1);
|
||||
if (buffer_size % 4096) {
|
||||
printf("ERROR: Size of buffer is not page aligned\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
uint8_t *buffer = mmap(0, buffer_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
|
||||
if (buffer == NULL) {
|
||||
printf("ERROR: Failed to allocate buffer\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if ((size_t)buffer % 64 != 0) {
|
||||
printf("ERROR: Allocated buffer is not cache line aligned, it is %ld\n", (size_t)buffer % 64);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Touch pages so they would be mapped in, to avoid page faults during tests
|
||||
for (uint64_t i = 0; i < buffer_size; i += 4096) {
|
||||
buffer[i] = (uint8_t)i;
|
||||
}
|
||||
|
||||
for (int i = 0; i < ARRAY_LEN(byte_counts); i++) {
|
||||
uint64_t unadjusted_byte_count = byte_counts[i];
|
||||
assert(unadjusted_byte_count % 128 == 0); // Must be divisible by 128
|
||||
uint64_t byte_count = unadjusted_byte_count - 128;
|
||||
|
||||
for (int j = 0; j < ARRAY_LEN(offsets); j++) {
|
||||
uint64_t offset = offsets[j];
|
||||
|
||||
uint64_t outer_loop_count = buffer_size / byte_count;
|
||||
uint64_t bytes_read = outer_loop_count * byte_count;
|
||||
|
||||
repetitor_clear(&repetitor);
|
||||
while (repetitor_repeat(&repetitor, 5)) {
|
||||
repetitor_start(&repetitor);
|
||||
repetitor_measure_start(&repetitor);
|
||||
cyclic_load_bytes(buffer + offset, byte_count, outer_loop_count);
|
||||
repetitor_measure_stop(&repetitor, bytes_read);
|
||||
repetitor_stop(&repetitor);
|
||||
}
|
||||
|
||||
char name[128] = { 0 };
|
||||
snprintf(name, sizeof(name), "%ld offset, %ld size", offset, byte_count);
|
||||
repetitor_print_results_label(&repetitor, name);
|
||||
}
|
||||
}
|
||||
|
||||
munmap(buffer, buffer_size);
|
||||
|
||||
return 0;
|
||||
}
|
Loading…
Reference in New Issue
Block a user