Compare commits
2 Commits
8473de5f13
...
4fb4fdbc36
Author | SHA1 | Date | |
---|---|---|---|
4fb4fdbc36 | |||
d275d207a7 |
3
.gitignore
vendored
3
.gitignore
vendored
@ -1 +1,2 @@
|
||||
build
|
||||
zig-cache
|
||||
zig-out
|
||||
|
31
Makefile
31
Makefile
@ -1,31 +0,0 @@
|
||||
CFLAGS=-lm -g -Wall -O1

# TODO: Move this to a build.zig. This is becoming a mess, I don't want to deal with Makefiles anymore

# Objects assembled from src/*.asm, in the order they are passed to the linker.
ASM_OBJS=build/multi_nop_loop.o build/write_loops.o build/load_uop.o build/store_uop.o build/short_load_uop.o build/read_widths.o

# `run` and `clean` are actions, not files; keep them working even if a file
# with one of those names ever appears in the tree.
.PHONY: run clean

build/main: src/main.c src/repetition_tester.c $(ASM_OBJS) src/rprof.h
	mkdir -p build
	gcc -o build/main src/main.c $(ASM_OBJS) $(CFLAGS)

# One pattern rule for every assembly object. The objects are prerequisites of
# build/main and are therefore built before its recipe runs, so the output
# directory has to be created here as well or nasm fails on a clean checkout.
build/%.o: src/%.asm
	mkdir -p build
	nasm -g -f elf64 -o $@ $<

run: ./build/main
	./build/main $(TEST_NAME)

# -f: do not fail when there is nothing to clean.
clean:
	rm -rf build
|
83
build.zig
Normal file
83
build.zig
Normal file
@ -0,0 +1,83 @@
|
||||
const std = @import("std");
|
||||
const Builder = std.build.Builder;
|
||||
|
||||
/// Registers a `nasm -g -f elf64` invocation that assembles `filename` and
/// returns the lazy path of the object file it produces.
///
/// The object is named after the stem of `filename` with `.o` appended.
/// The only failure mode is running out of memory while building that name.
fn addLinuxAssembly(b: *Builder, filename: []const u8) !std.Build.LazyPath {
    const stem = std.fs.path.stem(filename);
    const object_name = try std.mem.concat(b.allocator, u8, &.{ stem, ".o" });

    const nasm = b.addSystemCommand(&.{ "nasm", "-g", "-f", "elf64", "-o" });

    // Argument order matters: the output path has to follow "-o" directly,
    // and the assembly source goes last.
    const object_path = nasm.addOutputFileArg(object_name);
    nasm.addFileArg(.{ .path = filename });

    return object_path;
}
|
||||
|
||||
/// Scans `src` (non-recursively) for regular files ending in `.asm`, registers
/// an assembler step for each through `addLinuxAssembly`, and returns the lazy
/// paths of the resulting object files.
///
/// The caller owns the returned list and must `deinit` it. On error the
/// partially filled list is released before returning.
fn addAllLinuxAssmeblies(b: *Builder) !std.ArrayList(std.Build.LazyPath) {
    var linux_assemblies = std.ArrayList(std.Build.LazyPath).init(b.allocator);
    errdefer linux_assemblies.deinit();

    var dir = try std.fs.cwd().openIterableDir("src", .{});
    // Release the directory handle on every exit path, including errors
    // raised while iterating.
    defer dir.close();

    var it = dir.iterate();
    while (try it.next()) |file| {
        if (file.kind != .file) continue;

        const ext = std.fs.path.extension(file.name);
        if (!std.mem.eql(u8, ext, ".asm")) continue;

        // `file.name` is only the entry name; prefix the directory so the
        // path is relative to the build root.
        const assembly_path = try std.mem.concat(b.allocator, u8, &.{ "src/", file.name });
        defer b.allocator.free(assembly_path);

        try linux_assemblies.append(try addLinuxAssembly(b, assembly_path));
    }

    return linux_assemblies;
}
|
||||
|
||||
/// Build-script entry point.
///
/// Every regular `.c` file directly inside `src/tests` becomes its own
/// executable named after the file's stem. Each executable gets `src` on its
/// include path, links libc and every object assembled from `src/*.asm`, is
/// installed, and gets a top-level step of the same name that runs it with
/// `b.args` forwarded when present.
pub fn build(b: *Builder) !void {
    const target = b.standardTargetOptions(.{});
    const optimize = b.standardOptimizeOption(.{});

    var linux_assemblies = try addAllLinuxAssmeblies(b);
    defer linux_assemblies.deinit();

    var dir = try std.fs.cwd().openIterableDir("src/tests", .{});
    // Release the directory handle on every exit path, including errors
    // raised while iterating.
    defer dir.close();

    var it = dir.iterate();
    while (try it.next()) |file| {
        if (file.kind != .file) continue;

        const ext = std.fs.path.extension(file.name);
        if (!std.mem.eql(u8, ext, ".c")) continue;

        const source_file_path = try std.mem.concat(b.allocator, u8, &.{ "src/tests/", file.name });
        defer b.allocator.free(source_file_path);

        const executable_name = std.fs.path.stem(file.name);

        const exe = b.addExecutable(.{
            .name = executable_name,
            .root_source_file = .{ .path = source_file_path },
            .optimize = optimize,
            .target = target,
        });
        exe.addIncludePath(.{ .path = "src" });
        exe.linkLibC();

        // Every test links against all assembly objects, whether or not it
        // calls into each of them.
        for (linux_assemblies.items) |obj| {
            exe.addObjectFile(obj);
        }

        b.installArtifact(exe);

        const run_exe = b.addRunArtifact(exe);
        if (b.args) |args| {
            run_exe.addArgs(args);
        }

        const step_description = try std.fmt.allocPrint(b.allocator, "Run '{s}' test", .{source_file_path});
        defer b.allocator.free(step_description);

        // The step shares the executable's name, e.g. `zig build cache_size`.
        const run_step = b.step(executable_name, step_description);
        run_step.dependOn(&run_exe.step);
    }
}
|
1
compile_flags.txt
Normal file
1
compile_flags.txt
Normal file
@ -0,0 +1 @@
|
||||
-Isrc
|
22
src/cyclic_load_bytes.asm
Normal file
22
src/cyclic_load_bytes.asm
Normal file
@ -0,0 +1,22 @@
|
||||
global cyclic_load_bytes
|
||||
|
||||
section .text
|
||||
|
||||
; void cyclic_load_bytes(uint8_t *buffer, uint64_t byte_count, uint64_t mask)
;
; Issues four 32-byte unaligned AVX loads (128 bytes) per iteration until the
; running offset reaches byte_count. Before each iteration the offset is ANDed
; with mask, so the loads wrap around inside the first mask + 1 bytes of the
; buffer (the C caller passes masks of the form 2^k - 1 with k >= 12).
; The loaded data is discarded; only the memory traffic matters.
;
; The exit check sits at the bottom of the loop, so one iteration always runs,
; even when byte_count is 0.
;
; rdi - buffer
; rsi - byte_count
; rdx - mask
;
; Clobbers rcx, r8, ymm0 and the flags.
cyclic_load_bytes:
    xor rcx, rcx                ; rcx = running byte offset
align 64
.loop:
    mov r8, rcx
    and r8, rdx                 ; wrap the offset into the masked window
    add r8, rdi                 ; r8 = address of this 128-byte chunk
    vmovdqu ymm0, [r8]
    vmovdqu ymm0, [r8 + 32]
    vmovdqu ymm0, [r8 + 64]
    vmovdqu ymm0, [r8 + 96]
    add rcx, 128
    cmp rcx, rsi
    jb .loop
    ; Clear the upper halves of the YMM registers before returning to
    ; compiler-generated code, which may use legacy SSE instructions and
    ; would otherwise pay an AVX/SSE transition cost.
    vzeroupper
    ret
|
3
src/cyclic_load_bytes.h
Normal file
3
src/cyclic_load_bytes.h
Normal file
@ -0,0 +1,3 @@
|
||||
#ifndef CYCLIC_LOAD_BYTES_H
#define CYCLIC_LOAD_BYTES_H

#include <stdint.h>

// Implemented in cyclic_load_bytes.asm.
// Performs loads from `buffer` in 128-byte steps until the running offset
// reaches `byte_count`; each step's offset is ANDed with `mask`, so the reads
// cycle through the start of the buffer.
void cyclic_load_bytes(uint8_t *buffer, uint64_t byte_count, uint64_t mask);

#endif // CYCLIC_LOAD_BYTES_H
|
@ -165,6 +165,12 @@ static float divide_safe(float a, float b) {
|
||||
return b != 0 ? a / b : 0;
|
||||
}
|
||||
|
||||
float repetitor_get_best_bandwidth(struct repetitor *repetitor) {
|
||||
uint64_t min_time_taken = repetitor->min_time_taken;
|
||||
uint64_t min_byte_count = repetitor->min_byte_count;
|
||||
return bytes_to_gb_s(repetitor, min_byte_count, min_time_taken);
|
||||
}
|
||||
|
||||
void repetitor_print_results(struct repetitor *repetitor) {
|
||||
uint64_t avg_time_taken = repetitor->total_time_taken/repetitor->repetition_count;
|
||||
uint64_t min_time_taken = repetitor->min_time_taken;
|
||||
|
75
src/tests/cache_size.c
Normal file
75
src/tests/cache_size.c
Normal file
@ -0,0 +1,75 @@
|
||||
#include "repetition_tester.c"
|
||||
#include "cyclic_load_bytes.h"
|
||||
#include <sys/mman.h>
|
||||
|
||||
// One benchmark configuration: the label used in the report and the offset
// mask handed to cyclic_load_bytes().
struct testcase {
    char *name;
    uint64_t mask;
};

// Each mask is (region size in bytes) - 1, written as (1 << log2(size)) - 1.
// The 256KiB, 256MiB and 512MiB sizes are not part of the list.
struct testcase cases[] = {
    { .name = "load 4KiB",   .mask = (UINT64_C(1) << 12) - 1 },
    { .name = "load 8KiB",   .mask = (UINT64_C(1) << 13) - 1 },
    { .name = "load 16KiB",  .mask = (UINT64_C(1) << 14) - 1 },
    { .name = "load 32KiB",  .mask = (UINT64_C(1) << 15) - 1 },
    { .name = "load 64KiB",  .mask = (UINT64_C(1) << 16) - 1 },
    { .name = "load 128KiB", .mask = (UINT64_C(1) << 17) - 1 },

    { .name = "load 512KiB", .mask = (UINT64_C(1) << 19) - 1 },
    { .name = "load 1MiB",   .mask = (UINT64_C(1) << 20) - 1 },
    { .name = "load 2MiB",   .mask = (UINT64_C(1) << 21) - 1 },
    { .name = "load 4MiB",   .mask = (UINT64_C(1) << 22) - 1 },
    { .name = "load 8MiB",   .mask = (UINT64_C(1) << 23) - 1 },
    { .name = "load 16MiB",  .mask = (UINT64_C(1) << 24) - 1 },
    { .name = "load 32MiB",  .mask = (UINT64_C(1) << 25) - 1 },
    { .name = "load 64MiB",  .mask = (UINT64_C(1) << 26) - 1 },
    { .name = "load 128MiB", .mask = (UINT64_C(1) << 27) - 1 },

    { .name = "load 1GiB",   .mask = (UINT64_C(1) << 30) - 1 },
};
|
||||
|
||||
int main() {
|
||||
struct repetitor repetitor = {};
|
||||
repetitor_init(&repetitor);
|
||||
printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000));
|
||||
|
||||
uint64_t byte_count = 1024 * 1024 * 1024;
|
||||
if (byte_count % 4096) {
|
||||
printf("ERROR: Size of buffer is not page aligned\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
uint8_t *buffer = mmap(0, byte_count, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
|
||||
if (buffer == NULL) {
|
||||
printf("ERROR: Failed to allocate buffer\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if ((size_t)buffer % 64 != 0) {
|
||||
printf("ERROR: Allocated buffer is not cache line aligned, it is %ld\n", (size_t)buffer % 64);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Touch pages so they would be mapped in, to avoid page faults during tests
|
||||
for (uint64_t i = 0; i < byte_count; i += 4096) {
|
||||
buffer[i] = (uint8_t)i;
|
||||
}
|
||||
|
||||
for (int i = 0; i < ARRAY_LEN(cases); i++) {
|
||||
struct testcase *testcase = &cases[i];
|
||||
repetitor_clear(&repetitor);
|
||||
while (repetitor_repeat(&repetitor, 2)) {
|
||||
repetitor_start(&repetitor);
|
||||
repetitor_measure_start(&repetitor);
|
||||
cyclic_load_bytes(buffer, byte_count, testcase->mask);
|
||||
repetitor_measure_stop(&repetitor, byte_count);
|
||||
repetitor_stop(&repetitor);
|
||||
}
|
||||
repetitor_print_results_label(&repetitor, testcase->name);
|
||||
// printf("%ld;%f\n", testcase->mask, repetitor_get_best_bandwidth(&repetitor));
|
||||
}
|
||||
|
||||
munmap(buffer, byte_count);
|
||||
|
||||
return 0;
|
||||
}
|
@ -1,20 +1,20 @@
|
||||
#include "repetition_tester.c"
|
||||
#include "load_uop.h"
|
||||
|
||||
int main_test_load_uop() {
|
||||
typedef void (*test_cb)(uint8_t *buffer, uint64_t byte_count);
|
||||
struct testcase {
|
||||
char *name;
|
||||
test_cb cb;
|
||||
};
|
||||
typedef void (*test_cb)(uint8_t *buffer, uint64_t byte_count);
|
||||
struct testcase {
|
||||
char *name;
|
||||
test_cb cb;
|
||||
};
|
||||
|
||||
struct testcase cases[] = {
|
||||
{ .name = "mov_load_x1()", .cb = mov_load_x1 },
|
||||
{ .name = "mov_load_x2()", .cb = mov_load_x2 },
|
||||
{ .name = "mov_load_x3()", .cb = mov_load_x3 },
|
||||
{ .name = "mov_load_x4()", .cb = mov_load_x4 },
|
||||
};
|
||||
struct testcase cases[] = {
|
||||
{ .name = "mov_load_x1()", .cb = mov_load_x1 },
|
||||
{ .name = "mov_load_x2()", .cb = mov_load_x2 },
|
||||
{ .name = "mov_load_x3()", .cb = mov_load_x3 },
|
||||
{ .name = "mov_load_x4()", .cb = mov_load_x4 },
|
||||
};
|
||||
|
||||
int main() {
|
||||
struct repetitor repetitor = {};
|
||||
repetitor_init(&repetitor);
|
||||
printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000));
|
@ -54,7 +54,14 @@ static void read_malloc_file_with_read(bool should_alloc, struct repetitor *repe
|
||||
free_buffer(should_alloc, buffer);
|
||||
}
|
||||
|
||||
int main_test_malloc_read(char *filename) {
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 2) {
|
||||
printf("Usage: %s <filename>\n", argv[0]);
|
||||
return -1;
|
||||
}
|
||||
|
||||
char *filename = argv[1];
|
||||
|
||||
typedef void (*read_file_b)(bool should_alloc, struct repetitor *repetitor, uint8_t *buffer, uint64_t buffer_size, char *filename);
|
||||
struct testcase {
|
||||
char *name;
|
@ -46,7 +46,15 @@ void read_file_with_read(struct repetitor *repetitor, uint8_t *buffer, uint64_t
|
||||
close(file);
|
||||
}
|
||||
|
||||
int main_test_read_file(char *filename) {
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 2) {
|
||||
printf("Usage: %s <filename>\n", argv[0]);
|
||||
return -1;
|
||||
}
|
||||
|
||||
char *filename = argv[1];
|
||||
|
||||
|
||||
typedef void (*read_file_b)(struct repetitor *repetitor, uint8_t *buffer, uint64_t buffer_size, char *filename);
|
||||
struct testcase {
|
||||
char *name;
|
@ -1,7 +1,7 @@
|
||||
#include "repetition_tester.c"
|
||||
#include "read_widths.h"
|
||||
|
||||
int main_test_read_widths() {
|
||||
int main() {
|
||||
typedef void (*test_cb)(uint8_t *buffer, uint64_t byte_count);
|
||||
struct testcase {
|
||||
char *name;
|
@ -1,7 +1,7 @@
|
||||
#include "repetition_tester.c"
|
||||
#include "short_load_uop.h"
|
||||
|
||||
int main_test_short_load_uop() {
|
||||
int main() {
|
||||
typedef void (*test_cb)(uint8_t *buffer, uint64_t byte_count);
|
||||
struct testcase {
|
||||
char *name;
|
@ -1,7 +1,7 @@
|
||||
#include "repetition_tester.c"
|
||||
#include "store_uop.h"
|
||||
|
||||
int main_test_store_uop() {
|
||||
int main() {
|
||||
typedef void (*test_cb)(uint8_t *buffer, uint64_t byte_count);
|
||||
struct testcase {
|
||||
char *name;
|
@ -19,7 +19,7 @@ static void test_write_to_all_bytes(struct repetitor *repetitor, uint8_t *buffer
|
||||
}
|
||||
}
|
||||
|
||||
int main_test_write_all_bytes() {
|
||||
int main() {
|
||||
struct repetitor repetitor = {};
|
||||
repetitor_init(&repetitor);
|
||||
printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000));
|
@ -1,6 +1,6 @@
|
||||
#include "repetition_tester.c"
|
||||
|
||||
int main_test_write_backward() {
|
||||
int main() {
|
||||
struct repetitor repetitor = {};
|
||||
repetitor_init(&repetitor);
|
||||
printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000));
|
@ -1,7 +1,7 @@
|
||||
#include "repetition_tester.c"
|
||||
#include "multi_nop_loop.h"
|
||||
|
||||
int main_test_write_bytes_asm() {
|
||||
int main() {
|
||||
typedef void (*write_bytes_cb)(uint8_t *buffer, uint64_t byte_count);
|
||||
struct testcase {
|
||||
char *name;
|
@ -1,7 +1,7 @@
|
||||
#include "repetition_tester.c"
|
||||
#include "write_loops.h"
|
||||
|
||||
int main_test_write_loop() {
|
||||
int main() {
|
||||
struct repetitor repetitor = {};
|
||||
repetitor_init(&repetitor);
|
||||
printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000));
|
Loading…
Reference in New Issue
Block a user