1
0

Compare commits

..

2 Commits

Author SHA1 Message Date
0c0a4b6975 add non-temporal store tests 2024-05-19 13:10:11 +03:00
3f66b12c92 add cache set tests 2024-05-17 00:08:46 +03:00
6 changed files with 393 additions and 7 deletions

9
README.md Normal file
View File

@ -0,0 +1,9 @@
# Repetition tester
For: https://www.computerenhance.com/
Zig version used: 0.12.0
## Linux calling convention register order
RDI, RSI, RDX, RCX, R8, R9, [XYZ]MM0–7

View File

@ -1,7 +1,7 @@
const std = @import("std");
const Builder = std.build.Builder;
const Build = std.Build;
fn addLinuxAssembly(b: *Builder, filename: []const u8) !std.Build.LazyPath {
fn addLinuxAssembly(b: *Build, filename: []const u8) !std.Build.LazyPath {
const obj_basename = try std.mem.concat(b.allocator, u8, &.{
std.fs.path.stem(filename),
".o"
@ -14,13 +14,13 @@ fn addLinuxAssembly(b: *Builder, filename: []const u8) !std.Build.LazyPath {
return output_obj;
}
fn addAllLinuxAssmeblies(b: *Builder, path: []const u8) !std.ArrayList(std.Build.LazyPath) {
fn addAllLinuxAssmeblies(b: *Build, path: []const u8) !std.ArrayList(std.Build.LazyPath) {
const allocator = b.allocator;
var linux_assemblies = std.ArrayList(std.Build.LazyPath).init(allocator);
errdefer linux_assemblies.deinit();
var dir = try std.fs.cwd().openIterableDir(path, .{ });
var dir = try std.fs.cwd().openDir(path, .{ .iterate = true });
var it = dir.iterate();
while (try it.next()) |file| {
if (file.kind != .file) continue;
@ -37,13 +37,13 @@ fn addAllLinuxAssmeblies(b: *Builder, path: []const u8) !std.ArrayList(std.Build
return linux_assemblies;
}
pub fn build(b: *Builder) !void {
pub fn build(b: *Build) !void {
const target = b.standardTargetOptions(.{});
const optimize = b.standardOptimizeOption(.{});
const allocator = b.allocator;
var dir = try std.fs.cwd().openIterableDir("src", .{ });
var dir = try std.fs.cwd().openDir("src", .{ .iterate = true });
var it = dir.iterate();
while (try it.next()) |entry| {
if (entry.kind != .directory) continue;
@ -56,12 +56,12 @@ pub fn build(b: *Builder) !void {
const exe = b.addExecutable(.{
.name = entry.name,
.root_source_file = .{ .path = main_c },
.optimize = optimize,
.target = target
});
exe.addIncludePath(.{ .path = program_dir });
exe.addIncludePath(.{ .path = "src" });
exe.addCSourceFile(.{ .file = b.path(main_c) });
exe.linkLibC();
var assemblies = try addAllLinuxAssmeblies(b, program_dir);

View File

@ -0,0 +1,23 @@
global load_bytes
section .text
; void load_bytes(uint8_t *buffer, uint32_t inner_loop, uint32_t outer_loop, uint32_t step_size);
;
; Cache-set probe: performs outer_loop passes; each pass starts again at
; buffer and executes inner_loop iterations, every iteration issuing two
; unaligned 32-byte loads (64 bytes) and then advancing the read pointer by
; step_size bytes.
;
; rdi - buffer
; rsi - inner_loop_count (32-bit argument)
; rdx - outer_loop_count (32-bit argument)
; rcx - step_size        (32-bit argument)
;
; Clobbers: rsi, rdx, rcx, r8, r9, ymm0, flags (all caller-saved).
load_bytes:
    ; The counts and the step are declared as 32-bit integers on the C side;
    ; the System V ABI leaves the upper halves of their registers undefined,
    ; so zero-extend them before using the full 64-bit registers.
    mov esi, esi
    mov edx, edx
    mov ecx, ecx
    ; A zero count would wrap around in the dec/jnz loops below.
    test esi, esi
    jz .done
    test edx, edx
    jz .done
align 64
.outer_loop:
    mov r8, rdi                 ; restart at the beginning of the buffer
    mov r9, rsi                 ; reload the inner iteration counter
.inner_loop:
    vmovdqu ymm0, [r8]
    vmovdqu ymm0, [r8 + 32]
    add r8, rcx
    dec r9
    jnz .inner_loop
    dec rdx
    jnz .outer_loop
.done:
    ret

76
src/13_cache_sets/main.c Normal file
View File

@ -0,0 +1,76 @@
#include "repetition_tester.c"
#include <sys/mman.h>
// Implemented in assembly (exported there as `load_bytes`).
// Runs `outer_loop` passes over `buffer`; each pass performs `inner_loop`
// iterations that load 64 bytes and then advance the read pointer by
// `step_size` bytes.
void load_bytes(uint8_t *buffer, uint32_t inner_loop, uint32_t outer_loop, uint32_t step_size);
// Converts a count of KiB into bytes (1 KiB = 2^10 bytes).
static uint64_t kibibytes(uint64_t count) {
    return count << 10;
}
// Converts a count of MiB into bytes: 1024 times the KiB conversion.
static uint64_t mibibytes(uint64_t count) {
    const uint64_t as_kib_bytes = kibibytes(count);
    return as_kib_bytes * 1024;
}
// Converts a count of GiB into bytes: 1024 times the MiB conversion.
static uint64_t gibibytes(uint64_t count) {
    const uint64_t as_mib_bytes = mibibytes(count);
    return as_mib_bytes * 1024;
}
// Cache-set benchmark driver.
//
// Maps a 1 GiB anonymous buffer, pre-faults every page, then for each step
// size 0, 64, ..., 127*64 bytes measures load_bytes() and prints one
// "<step_size>;<best bandwidth>" line per step size.
//
// Returns 0 on success and -1 if the buffer cannot be set up.
int main() {
    struct repetitor repetitor = {};
    repetitor_init(&repetitor);
    printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000));

    uint64_t buffer_size = gibibytes(1);
    if (buffer_size % 4096) {
        printf("ERROR: Size of buffer is not page aligned\n");
        return -1;
    }

    // mmap() reports failure with MAP_FAILED ((void *)-1), never with NULL.
    uint8_t *buffer = mmap(0, buffer_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buffer == MAP_FAILED) {
        printf("ERROR: Failed to allocate buffer\n");
        return -1;
    }

    if ((size_t)buffer % 64 != 0) {
        printf("ERROR: Allocated buffer is not cache line aligned, it is %zu\n", (size_t)buffer % 64);
        munmap(buffer, buffer_size);
        return -1;
    }

    // Touch pages so they would be mapped in, to avoid page faults during tests
    for (uint64_t i = 0; i < buffer_size; i += 4096) {
        buffer[i] = (uint8_t)i;
    }

    for (int i = 0; i < 128; i++) {
        uint64_t step_size = i*64;
        uint64_t outer_loop = 64;
        uint64_t inner_loop = 256;

        repetitor_clear(&repetitor);
        while (repetitor_repeat(&repetitor, 2)) {
            repetitor_start(&repetitor);
            repetitor_measure_start(&repetitor);
            load_bytes(buffer, inner_loop, outer_loop, step_size);
            // Every inner iteration of load_bytes reads 64 bytes.
            repetitor_measure_stop(&repetitor, outer_loop * inner_loop * 64);
            repetitor_stop(&repetitor);
        }

        printf("%lu;%f\n", (unsigned long)step_size, repetitor_get_best_bandwidth(&repetitor));
    }

    munmap(buffer, buffer_size);

    return 0;
}

View File

@ -0,0 +1,115 @@
#include "repetition_tester.c"
#include <sys/mman.h>
// Store benchmarks implemented in assembly. All four share one signature:
//   load_buffer  - source data
//   load_size    - number of source bytes (consumed in 128-byte chunks)
//   store_buffer - destination
//   store_repeat - how many times the source data is stored
// The "temporal" variants store with vmovdqu, the "non_temporal" variants
// with vmovntdq.
// v1 copies the whole load buffer store_repeat times back to back.
// v2 loads each 128-byte chunk once and stores it store_repeat times,
// stepping the destination by load_size between stores.
void store_temporal_v1(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
void store_non_temporal_v1(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
void store_temporal_v2(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
void store_non_temporal_v2(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
// Returns the number of bytes in `count` KiB.
static uint64_t kibibytes(uint64_t count) {
    const uint64_t bytes_per_kib = 1024;
    return count * bytes_per_kib;
}
// Returns the number of bytes in `count` MiB, built on top of kibibytes().
static uint64_t mibibytes(uint64_t count) {
    uint64_t bytes = kibibytes(count);
    bytes *= 1024;
    return bytes;
}
// Returns the number of bytes in `count` GiB, built on top of mibibytes().
static uint64_t gibibytes(uint64_t count) {
    uint64_t bytes = mibibytes(count);
    bytes *= 1024;
    return bytes;
}
int main() {
uint32_t load_size = mibibytes(2);
uint32_t store_repeat = 256;
assert(load_size % 128 == 0); // Must be a 128 byte multiple
struct repetitor repetitor = {};
repetitor_init(&repetitor);
printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000));
uint64_t buffer_size = gibibytes(1);
if (buffer_size % 4096) {
printf("ERROR: Size of buffer is not page aligned\n");
return -1;
}
uint8_t *buffer = mmap(0, buffer_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (buffer == NULL) {
printf("ERROR: Failed to allocate buffer\n");
return -1;
}
if ((size_t)buffer % 64 != 0) {
printf("ERROR: Allocated buffer is not cache line aligned, it is %ld\n", (size_t)buffer % 64);
return -1;
}
// Touch pages so they would be mapped in, to avoid page faults during tests
for (uint64_t i = 0; i < buffer_size; i += 4096) {
buffer[i] = (uint8_t)i;
}
assert(load_size * store_repeat <= buffer_size/2);
uint8_t *load_buffer = buffer;
uint8_t *store_buffer = buffer + buffer_size / 2;
{
repetitor_clear(&repetitor);
while (repetitor_repeat(&repetitor, 10)) {
repetitor_start(&repetitor);
repetitor_measure_start(&repetitor);
store_temporal_v1(load_buffer, load_size, store_buffer, store_repeat);
repetitor_measure_stop(&repetitor, load_size * (1+store_repeat));
repetitor_stop(&repetitor);
}
repetitor_print_results_label(&repetitor, "temporal v1");
}
{
repetitor_clear(&repetitor);
while (repetitor_repeat(&repetitor, 10)) {
repetitor_start(&repetitor);
repetitor_measure_start(&repetitor);
store_non_temporal_v1(load_buffer, load_size, store_buffer, store_repeat);
repetitor_measure_stop(&repetitor, load_size * (1+store_repeat));
repetitor_stop(&repetitor);
}
repetitor_print_results_label(&repetitor, "non-temporal v1");
}
{
repetitor_clear(&repetitor);
while (repetitor_repeat(&repetitor, 10)) {
repetitor_start(&repetitor);
repetitor_measure_start(&repetitor);
store_temporal_v2(load_buffer, load_size, store_buffer, store_repeat);
repetitor_measure_stop(&repetitor, load_size * (1+store_repeat));
repetitor_stop(&repetitor);
}
repetitor_print_results_label(&repetitor, "temporal v2");
}
{
repetitor_clear(&repetitor);
while (repetitor_repeat(&repetitor, 10)) {
repetitor_start(&repetitor);
repetitor_measure_start(&repetitor);
store_non_temporal_v2(load_buffer, load_size, store_buffer, store_repeat);
repetitor_measure_stop(&repetitor, load_size * (1+store_repeat));
repetitor_stop(&repetitor);
}
repetitor_print_results_label(&repetitor, "non-temporal v2");
}
munmap(buffer, buffer_size);
return 0;
}

View File

@ -0,0 +1,163 @@
global store_temporal_v1
global store_non_temporal_v1
global store_temporal_v2
global store_non_temporal_v2
section .text
; void store_temporal_v1(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
;
; Copies the load buffer into the store buffer store_repeat times back to
; back, 128 bytes per iteration, using regular (temporal) unaligned stores.
; Writes load_size * store_repeat bytes starting at store_buffer.
; load_size must be a multiple of 128: the copy loop only terminates when its
; byte counter reaches exactly zero.
;
; rdi - load buffer
; rsi - load size    (32-bit argument)
; rdx - store buffer
; rcx - store repeat (32-bit argument)
store_temporal_v1:
    push r10
    push r11
    push r12

    ; 32-bit arguments arrive with undefined upper register halves (System V
    ; ABI); zero-extend before using the full 64-bit registers.
    mov esi, esi
    mov ecx, ecx
    ; A zero size or repeat count would wrap around in the loops below.
    test esi, esi
    jz .done
    test ecx, ecx
    jz .done

    mov r11, rdx                ; r11 = running store pointer
.many_copy:
    mov r10, rdi                ; r10 = load pointer, restarted for every copy
    mov r12, rsi                ; r12 = bytes left in this copy
.single_copy:
    vmovdqu ymm0, [r10 + 0]
    vmovdqu ymm1, [r10 + 32]
    vmovdqu ymm2, [r10 + 64]
    vmovdqu ymm3, [r10 + 96]
    vmovdqu [r11 + 0], ymm0
    vmovdqu [r11 + 32], ymm1
    vmovdqu [r11 + 64], ymm2
    vmovdqu [r11 + 96], ymm3
    add r10, 128
    add r11, 128
    sub r12, 128
    jnz .single_copy
    dec rcx
    jnz .many_copy

.done:
    pop r12
    pop r11
    pop r10
    ret
; void store_temporal_v2(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
;
; Loads each 128-byte chunk of the load buffer once and stores it
; store_repeat times with regular (temporal) unaligned stores. The copies of
; a chunk are load_size bytes apart, so the store buffer ends up holding
; store_repeat back-to-back copies of the load buffer.
; load_size must be a multiple of 128: the outer loop only terminates when
; its byte counter reaches exactly zero.
;
; rdi - load buffer
; rsi - load size    (32-bit argument)
; rdx - store buffer
; rcx - store repeat (32-bit argument)
store_temporal_v2:
    push r10
    push r11
    push r12
    push r13

    ; 32-bit arguments arrive with undefined upper register halves (System V
    ; ABI); zero-extend before using the full 64-bit registers.
    mov esi, esi
    mov ecx, ecx
    ; A zero size or repeat count would wrap around in the loops below.
    test esi, esi
    jz .done
    test ecx, ecx
    jz .done

    mov r10, rdi                ; r10 = load pointer
    mov r13, rsi                ; r13 = bytes left to load
.outer_loop:
    vmovdqu ymm0, [r10 + 0]
    vmovdqu ymm1, [r10 + 32]
    vmovdqu ymm2, [r10 + 64]
    vmovdqu ymm3, [r10 + 96]

    mov r11, rdx                ; r11 = first destination of this chunk
    mov r12, rcx                ; r12 = copies left for this chunk
.inner_loop:
    vmovdqu [r11 + 0], ymm0
    vmovdqu [r11 + 32], ymm1
    vmovdqu [r11 + 64], ymm2
    vmovdqu [r11 + 96], ymm3
    add r11, rsi
    dec r12
    jnz .inner_loop

    add r10, 128
    ; Move the destination along with the source chunk. Without this every
    ; chunk would overwrite the same store_repeat slots of the store buffer.
    add rdx, 128
    sub r13, 128
    jnz .outer_loop

.done:
    pop r13
    pop r12
    pop r11
    pop r10
    ret
; void store_non_temporal_v1(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
;
; Copies the load buffer into the store buffer store_repeat times back to
; back, 128 bytes per iteration, using non-temporal stores (vmovntdq).
; Writes load_size * store_repeat bytes starting at store_buffer.
; load_size must be a multiple of 128: the copy loop only terminates when its
; byte counter reaches exactly zero.
; store_buffer must be 32-byte aligned, as vmovntdq requires an aligned
; memory operand. No sfence is issued before returning.
;
; rdi - load buffer
; rsi - load size    (32-bit argument)
; rdx - store buffer
; rcx - store repeat (32-bit argument)
store_non_temporal_v1:
    push r10
    push r11
    push r12

    ; 32-bit arguments arrive with undefined upper register halves (System V
    ; ABI); zero-extend before using the full 64-bit registers.
    mov esi, esi
    mov ecx, ecx
    ; A zero size or repeat count would wrap around in the loops below.
    test esi, esi
    jz .done
    test ecx, ecx
    jz .done

    mov r11, rdx                ; r11 = running store pointer
.many_copy:
    mov r10, rdi                ; r10 = load pointer, restarted for every copy
    mov r12, rsi                ; r12 = bytes left in this copy
.single_copy:
    vmovdqu ymm0, [r10 + 0]
    vmovdqu ymm1, [r10 + 32]
    vmovdqu ymm2, [r10 + 64]
    vmovdqu ymm3, [r10 + 96]
    vmovntdq [r11 + 0], ymm0
    vmovntdq [r11 + 32], ymm1
    vmovntdq [r11 + 64], ymm2
    vmovntdq [r11 + 96], ymm3
    add r10, 128
    add r11, 128
    sub r12, 128
    jnz .single_copy
    dec rcx
    jnz .many_copy

.done:
    pop r12
    pop r11
    pop r10
    ret
; void store_non_temporal_v2(uint8_t *load_buffer, uint32_t load_size, uint8_t *store_buffer, uint32_t store_repeat);
;
; Loads each 128-byte chunk of the load buffer once and stores it
; store_repeat times with non-temporal stores (vmovntdq). The copies of a
; chunk are load_size bytes apart, so the store buffer ends up holding
; store_repeat back-to-back copies of the load buffer.
; load_size must be a multiple of 128: the outer loop only terminates when
; its byte counter reaches exactly zero.
; store_buffer must be 32-byte aligned, as vmovntdq requires an aligned
; memory operand. No sfence is issued before returning.
;
; rdi - load buffer
; rsi - load size    (32-bit argument)
; rdx - store buffer
; rcx - store repeat (32-bit argument)
store_non_temporal_v2:
    push r10
    push r11
    push r12
    push r13

    ; 32-bit arguments arrive with undefined upper register halves (System V
    ; ABI); zero-extend before using the full 64-bit registers.
    mov esi, esi
    mov ecx, ecx
    ; A zero size or repeat count would wrap around in the loops below.
    test esi, esi
    jz .done
    test ecx, ecx
    jz .done

    mov r10, rdi                ; r10 = load pointer
    mov r13, rsi                ; r13 = bytes left to load
.outer_loop:
    vmovdqu ymm0, [r10 + 0]
    vmovdqu ymm1, [r10 + 32]
    vmovdqu ymm2, [r10 + 64]
    vmovdqu ymm3, [r10 + 96]

    mov r11, rdx                ; r11 = first destination of this chunk
    mov r12, rcx                ; r12 = copies left for this chunk
.inner_loop:
    vmovntdq [r11 + 0], ymm0
    vmovntdq [r11 + 32], ymm1
    vmovntdq [r11 + 64], ymm2
    vmovntdq [r11 + 96], ymm3
    add r11, rsi
    dec r12
    jnz .inner_loop

    add r10, 128
    ; Move the destination along with the source chunk. Without this every
    ; chunk would overwrite the same store_repeat slots of the store buffer.
    add rdx, 128
    sub r13, 128
    jnz .outer_loop

.done:
    pop r13
    pop r12
    pop r11
    pop r10
    ret