1
0

add manual prefetching test

This commit is contained in:
Rokas Puzonas 2024-06-23 17:42:53 +03:00
parent 0c0a4b6975
commit fbc02c6ff8
3 changed files with 252 additions and 1 deletions

View File

@ -0,0 +1,138 @@
#include "repetition_tester.c"
#include <stdio.h>
#include <sys/mman.h>
// I wasn't able to complete this homework:
// I failed to construct a case where the prefetch instruction helped.
//
// In hindsight, this is probably because the "hot loop" that processes
// the data is memory bound rather than CPU bound (unverified).
#define NODE_DATA_SIZE 32
// Number of bytes in `count` kibibytes (1 KiB = 2^10 bytes).
static uint64_t kibibytes(uint64_t count) {
    const uint64_t bytes = count << 10;
    return bytes;
}

// Number of bytes in `count` mebibytes (1 MiB = 2^10 KiB).
static uint64_t mibibytes(uint64_t count) {
    const uint64_t kib = kibibytes(count);
    return kib << 10;
}

// Number of bytes in `count` gibibytes (1 GiB = 2^10 MiB).
static uint64_t gibibytes(uint64_t count) {
    const uint64_t mib = mibibytes(count);
    return mib << 10;
}
// One node of the singly linked list walked by the process_data* routines.
// The field order is relied upon by the accompanying assembly: it loads the
// successor from offset 0 (`mov rdi, [rdi]`) and starts reading the payload
// at offset 8 (`add rdx, 8`).
struct list_node {
struct list_node *next; // following node; traversal stops at a null pointer
uint32_t data[NODE_DATA_SIZE]; // payload values that get summed
};
// Common signature of the list-summing routines, so they can be stored in one table.
typedef uint64_t (*process_fn)(struct list_node *start, uint64_t data_size);
// The routines below are defined in the accompanying assembly source (exported
// there with `global`). Each walks the list from `start`, adds up `data_size`
// 32-bit values per node and returns the total; the sum is accumulated in a
// 32-bit register (eax).
uint64_t process_data(struct list_node *start, uint64_t data_size); // no prefetch instruction
uint64_t process_data_prefetched0(struct list_node *start, uint64_t data_size); // uses prefetcht0
uint64_t process_data_prefetched1(struct list_node *start, uint64_t data_size); // uses prefetcht1
uint64_t process_data_prefetched2(struct list_node *start, uint64_t data_size); // uses prefetcht2
// Removes the element at `index` from `list`, shifting every later element one
// slot towards the front and decrementing `*count`.
//
// list  - array holding `*count` valid elements
// count - in/out number of valid elements
// index - position to remove; when it is not a valid position (which includes
//         the empty-list case) the call is a no-op instead of underflowing
//         `*count` and reading past the array
static void remove_from_list(int *list, size_t *count, size_t index)
{
    if (index >= *count) {
        return;
    }

    (*count)--;
    for (size_t i = index; i < *count; i++) {
        list[i] = list[i + 1];
    }
}
int main() {
uint64_t buffer_size = gibibytes(1);
if (buffer_size % 4096) {
printf("ERROR: Size of buffer is not page aligned\n");
return -1;
}
uint8_t *buffer = mmap(0, buffer_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (buffer == NULL) {
printf("ERROR: Failed to allocate buffer\n");
return -1;
}
struct repetitor repetitor = {};
repetitor_init(&repetitor);
printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000));
uint64_t list_size = 1024;
struct list_node *node_pool = (struct list_node *)buffer;
memset(node_pool, 0, sizeof(struct list_node) * list_size);
bool allocate_nodes_linearly = false;
struct list_node *start = &node_pool[0];
if (allocate_nodes_linearly) {
struct list_node *current = start;
for (int i = 0; i < list_size - 1; i++) {
node_pool[i].next = &node_pool[i+1];
}
} else {
srand(time(NULL));
int *free_ids = calloc(list_size, sizeof(int));
size_t free_id_count = list_size;
assert(free_ids);
for (int i = 0; i < list_size; i++) {
free_ids[i] = i;
}
remove_from_list(free_ids, &free_id_count, 0);
struct list_node *current = start;
while (free_id_count > 0) {
int free_id_index = rand() % free_id_count;
int free_id = free_ids[free_id_index];
remove_from_list(free_ids, &free_id_count, free_id_index);
current->next = &node_pool[free_id];
current = &node_pool[free_id];
// printf("assign %d\n", free_id);
}
}
{ // Initialize node data
size_t current_value = 0;
struct list_node *current = start;
while (current) {
for (int i = 0; i < NODE_DATA_SIZE; i++) {
current->data[i] = current_value;
current_value++;
}
current = current->next;
}
}
int max_value = list_size * NODE_DATA_SIZE;
int expected_result = max_value * (max_value-1) / 2;
struct {
const char *name;
process_fn fn;
} cases[] = {
{ .name = "no prefetch", .fn = process_data },
{ .name = "prefetcht0", .fn = process_data_prefetched0 },
{ .name = "prefetcht1", .fn = process_data_prefetched1 },
{ .name = "prefetcht2", .fn = process_data_prefetched2 }
};
for (int i = 0; i < ARRAY_LEN(cases); i++) {
process_fn process_fn = cases[i].fn;
repetitor_clear(&repetitor);
while (repetitor_repeat(&repetitor, 5)) {
repetitor_start(&repetitor);
repetitor_measure_start(&repetitor);
int result = process_fn(start, NODE_DATA_SIZE);
repetitor_measure_stop(&repetitor, NODE_DATA_SIZE * list_size);
repetitor_stop(&repetitor);
// TODO: For some f-ing reasong doing an assert here destroys the performance for the "no prefetch" case, WHYYYYYYYYYYY????
// Wierd stuff
// assert(result == expected_result);
}
repetitor_print_results_label(&repetitor, cases[i].name);
}
}

View File

@ -0,0 +1,113 @@
global process_data
global process_data_prefetched0
global process_data_prefetched1
global process_data_prefetched2
section .text
; process_data - sums the 32-bit payload of every node of a linked list
; without issuing any prefetch instruction (benchmark baseline).
;
; rdi - linked list start; must not be null, the node is read before the
;       pointer is tested
; rsi - node data size: number of 32-bit elements per node; must be non-zero,
;       the inner loop decrements before it tests, so 0 would wrap around
; Returns the total in eax (32-bit wrapping addition).
; Overwrites rcx, rdx, r8 and rdi.
process_data:
mov eax, 0
.loop:
; rcx - Data array loop counter
; rdx - Read location
; r8 - accumulated sum of the current node
mov rdx, rdi
add rdx, 8 ; step over the 8-byte next pointer at the start of the node
mov rcx, rsi
mov r8, 0
.sum_loop:
add r8d, [rdx]
add rdx, 4 ; advance to the following 32-bit element
sub rcx, 1
jnz .sum_loop
add eax, r8d ; fold this node's sum into the running total
mov rdi, [rdi] ; follow the next pointer stored at offset 0
test rdi, rdi
jnz .loop ; a null next pointer ends the list
ret
; process_data_prefetched0 - same summation as process_data, plus one
; prefetcht0 per node aimed at the node that follows in the list.
;
; rdi - linked list start; must not be null
; rsi - node data size: number of 32-bit elements per node; must be non-zero
; Returns the total in eax (32-bit wrapping addition).
; Overwrites rcx, rdx, r8, r9 and rdi.
process_data_prefetched0:
    mov eax, 0
.loop:
    ; Prefetch through the next pointer. The former target,
    ; [rdi + rsi*4 + 8], is the memory right behind the current node, which
    ; is the successor only when nodes are linked in memory order.
    ; r9 is null on the last node; a prefetch hint does not fault on an
    ; invalid address. Only the first cache line of the next node is requested.
    mov r9, [rdi]
    prefetcht0 [r9]
    ; rcx - Data array loop counter
    ; rdx - Read location
    ; r8 - accumulated sum of the current node
    mov rdx, rdi
    add rdx, 8          ; step over the 8-byte next pointer
    mov rcx, rsi
    mov r8, 0
.sum_loop:
    add r8d, [rdx]
    add rdx, 4
    sub rcx, 1
    jnz .sum_loop
    add eax, r8d
    ; Tail kept identical to process_data so the prefetch pair above is the
    ; only difference being measured.
    mov rdi, [rdi]
    test rdi, rdi
    jnz .loop
    ret
; process_data_prefetched1 - same summation as process_data, plus one
; prefetcht1 per node aimed at the node that follows in the list.
;
; rdi - linked list start; must not be null
; rsi - node data size: number of 32-bit elements per node; must be non-zero
; Returns the total in eax (32-bit wrapping addition).
; Overwrites rcx, rdx, r8, r9 and rdi.
process_data_prefetched1:
    mov eax, 0
.loop:
    ; Prefetch through the next pointer. The former target,
    ; [rdi + rsi*4 + 8], is the memory right behind the current node, which
    ; is the successor only when nodes are linked in memory order.
    ; r9 is null on the last node; a prefetch hint does not fault on an
    ; invalid address. Only the first cache line of the next node is requested.
    mov r9, [rdi]
    prefetcht1 [r9]
    ; rcx - Data array loop counter
    ; rdx - Read location
    ; r8 - accumulated sum of the current node
    mov rdx, rdi
    add rdx, 8          ; step over the 8-byte next pointer
    mov rcx, rsi
    mov r8, 0
.sum_loop:
    add r8d, [rdx]
    add rdx, 4
    sub rcx, 1
    jnz .sum_loop
    add eax, r8d
    ; Tail kept identical to process_data so the prefetch pair above is the
    ; only difference being measured.
    mov rdi, [rdi]
    test rdi, rdi
    jnz .loop
    ret
; process_data_prefetched2 - same summation as process_data, plus one
; prefetcht2 per node aimed at the node that follows in the list.
;
; rdi - linked list start; must not be null
; rsi - node data size: number of 32-bit elements per node; must be non-zero
; Returns the total in eax (32-bit wrapping addition).
; Overwrites rcx, rdx, r8, r9 and rdi.
process_data_prefetched2:
    mov eax, 0
.loop:
    ; Prefetch through the next pointer. The former target,
    ; [rdi + rsi*4 + 8], is the memory right behind the current node, which
    ; is the successor only when nodes are linked in memory order.
    ; r9 is null on the last node; a prefetch hint does not fault on an
    ; invalid address. Only the first cache line of the next node is requested.
    mov r9, [rdi]
    prefetcht2 [r9]
    ; rcx - Data array loop counter
    ; rdx - Read location
    ; r8 - accumulated sum of the current node
    mov rdx, rdi
    add rdx, 8          ; step over the 8-byte next pointer
    mov rcx, rsi
    mov r8, 0
.sum_loop:
    add r8d, [rdx]
    add rdx, 4
    sub rcx, 1
    jnz .sum_loop
    add eax, r8d
    ; Tail kept identical to process_data so the prefetch pair above is the
    ; only difference being measured.
    mov rdi, [rdi]
    test rdi, rdi
    jnz .loop
    ret

View File

@ -198,7 +198,7 @@ void repetitor_print_results(struct repetitor *repetitor) {
printf("Worst : %16ld : %10.6f : %12ld : %18.6f : %12ld : %18.6f :\n", max_time_taken, cycles_to_ms(repetitor, max_time_taken), max_byte_count, max_bandwidth, max_page_faults, max_kb_per_fault);
}
// Prints a "--------- <label> ---------" banner followed by the regular
// results table of `repetitor`.
// `label` is only read, so it is const-qualified; this lets callers pass
// string literals and `const char *` names without a cast.
void repetitor_print_results_label(struct repetitor *repetitor, const char *label) {
    printf("--------- %s ---------\n", label);
    repetitor_print_results(repetitor);
}