add manual prefetching test
This commit is contained in:
parent
0c0a4b6975
commit
fbc02c6ff8
138
src/15_manual_prefetch/main.c
Normal file
138
src/15_manual_prefetch/main.c
Normal file
@ -0,0 +1,138 @@
|
||||
#include "repetition_tester.c"
|
||||
#include <stdio.h>
|
||||
#include <sys/mman.h>
|
||||
|
||||
// I wasn't able to complete this homework:
// I failed to create a case where the prefetch instruction helped.
//
// Now that I think about it, this is probably because my "hot loop" that
// processes the data is memory bound rather than CPU bound. I'm not sure.
|
||||
|
||||
#define NODE_DATA_SIZE 32
|
||||
|
||||
// Converts a count of KiB into bytes (count * 2^10, wrapping on overflow).
static uint64_t kibibytes(uint64_t count) {
    const uint64_t bytes = count << 10;
    return bytes;
}
|
||||
|
||||
// Converts a count of MiB into bytes by scaling the KiB conversion by 1024.
static uint64_t mibibytes(uint64_t count) {
    const uint64_t kib_bytes = kibibytes(count);
    return kib_bytes * 1024;
}
|
||||
|
||||
// Converts a count of GiB into bytes by scaling the MiB conversion by 1024.
static uint64_t gibibytes(uint64_t count) {
    const uint64_t mib_bytes = mibibytes(count);
    return mib_bytes * 1024;
}
|
||||
|
||||
struct list_node {
|
||||
struct list_node *next;
|
||||
uint32_t data[NODE_DATA_SIZE];
|
||||
};
|
||||
|
||||
// Common signature of every list-summing routine exercised by the benchmark:
// takes the first node and the number of data elements to sum per node.
typedef uint64_t (*process_fn)(struct list_node *start, uint64_t data_size);

// Baseline walk without any prefetch instruction.
uint64_t process_data(struct list_node *start, uint64_t data_size);

// Same walk, issuing prefetcht0 / prefetcht1 / prefetcht2 respectively.
uint64_t process_data_prefetched0(struct list_node *start, uint64_t data_size);
uint64_t process_data_prefetched1(struct list_node *start, uint64_t data_size);
uint64_t process_data_prefetched2(struct list_node *start, uint64_t data_size);
|
||||
|
||||
/*
 * Removes the element at `index` from `list`, preserving the order of the
 * remaining elements by shifting the tail one slot to the left.
 *
 * list  - array holding *count valid elements
 * count - in/out element count; decremented when an element is removed
 * index - position of the element to remove
 *
 * An out-of-range index (which includes any index on an empty list) is a
 * no-op, so *count can never wrap below zero.
 */
static void remove_from_list(int *list, size_t *count, size_t index)
{
    if (index >= *count) {
        return;
    }

    (*count)--;
    // size_t index: no signed/unsigned comparison and no truncation of large indices.
    for (size_t i = index; i < *count; i++) {
        list[i] = list[i + 1];
    }
}
|
||||
|
||||
int main() {
|
||||
uint64_t buffer_size = gibibytes(1);
|
||||
if (buffer_size % 4096) {
|
||||
printf("ERROR: Size of buffer is not page aligned\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
uint8_t *buffer = mmap(0, buffer_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
|
||||
if (buffer == NULL) {
|
||||
printf("ERROR: Failed to allocate buffer\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
struct repetitor repetitor = {};
|
||||
repetitor_init(&repetitor);
|
||||
printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000));
|
||||
|
||||
uint64_t list_size = 1024;
|
||||
struct list_node *node_pool = (struct list_node *)buffer;
|
||||
memset(node_pool, 0, sizeof(struct list_node) * list_size);
|
||||
|
||||
bool allocate_nodes_linearly = false;
|
||||
struct list_node *start = &node_pool[0];
|
||||
|
||||
if (allocate_nodes_linearly) {
|
||||
struct list_node *current = start;
|
||||
for (int i = 0; i < list_size - 1; i++) {
|
||||
node_pool[i].next = &node_pool[i+1];
|
||||
}
|
||||
} else {
|
||||
srand(time(NULL));
|
||||
|
||||
int *free_ids = calloc(list_size, sizeof(int));
|
||||
size_t free_id_count = list_size;
|
||||
assert(free_ids);
|
||||
for (int i = 0; i < list_size; i++) {
|
||||
free_ids[i] = i;
|
||||
}
|
||||
|
||||
remove_from_list(free_ids, &free_id_count, 0);
|
||||
struct list_node *current = start;
|
||||
while (free_id_count > 0) {
|
||||
int free_id_index = rand() % free_id_count;
|
||||
int free_id = free_ids[free_id_index];
|
||||
remove_from_list(free_ids, &free_id_count, free_id_index);
|
||||
current->next = &node_pool[free_id];
|
||||
current = &node_pool[free_id];
|
||||
// printf("assign %d\n", free_id);
|
||||
}
|
||||
}
|
||||
|
||||
{ // Initialize node data
|
||||
size_t current_value = 0;
|
||||
struct list_node *current = start;
|
||||
while (current) {
|
||||
for (int i = 0; i < NODE_DATA_SIZE; i++) {
|
||||
current->data[i] = current_value;
|
||||
current_value++;
|
||||
}
|
||||
current = current->next;
|
||||
}
|
||||
}
|
||||
|
||||
int max_value = list_size * NODE_DATA_SIZE;
|
||||
int expected_result = max_value * (max_value-1) / 2;
|
||||
|
||||
struct {
|
||||
const char *name;
|
||||
process_fn fn;
|
||||
} cases[] = {
|
||||
{ .name = "no prefetch", .fn = process_data },
|
||||
{ .name = "prefetcht0", .fn = process_data_prefetched0 },
|
||||
{ .name = "prefetcht1", .fn = process_data_prefetched1 },
|
||||
{ .name = "prefetcht2", .fn = process_data_prefetched2 }
|
||||
};
|
||||
|
||||
for (int i = 0; i < ARRAY_LEN(cases); i++) {
|
||||
process_fn process_fn = cases[i].fn;
|
||||
|
||||
repetitor_clear(&repetitor);
|
||||
while (repetitor_repeat(&repetitor, 5)) {
|
||||
repetitor_start(&repetitor);
|
||||
repetitor_measure_start(&repetitor);
|
||||
int result = process_fn(start, NODE_DATA_SIZE);
|
||||
repetitor_measure_stop(&repetitor, NODE_DATA_SIZE * list_size);
|
||||
repetitor_stop(&repetitor);
|
||||
// TODO: For some f-ing reasong doing an assert here destroys the performance for the "no prefetch" case, WHYYYYYYYYYYY????
|
||||
// Wierd stuff
|
||||
// assert(result == expected_result);
|
||||
}
|
||||
|
||||
repetitor_print_results_label(&repetitor, cases[i].name);
|
||||
}
|
||||
}
|
113
src/15_manual_prefetch/prefetch.asm
Normal file
113
src/15_manual_prefetch/prefetch.asm
Normal file
@ -0,0 +1,113 @@
|
||||
global process_data
|
||||
global process_data_prefetched0
|
||||
global process_data_prefetched1
|
||||
global process_data_prefetched2
|
||||
|
||||
section .text
|
||||
|
||||
; uint64_t process_data(struct list_node *start, uint64_t data_size)
;
; Baseline list walk without prefetching: for every node, sums the first
; `data_size` dwords of its payload and accumulates the per-node sums.
;
; rdi - linked list start (NULL -> returns 0)
; rsi - node data size, i.e. number of dwords summed per node (0 -> returns 0)
; rax - result; accumulated in eax, so the sum wraps at 32 bits
;
; Node layout used here: qword next pointer at offset 0, dword data from offset 8.
; Clobbers rcx, rdx, rdi, r8 and the flags.
process_data:
    mov eax, 0

    ; Empty list: the first [rdx] load below would dereference NULL.
    test rdi, rdi
    jz .done
    ; Zero elements per node: `sub rcx, 1` would wrap and never hit zero in time.
    test rsi, rsi
    jz .done

.loop:
    ; rcx - Data array loop counter
    ; rdx - Read location
    ; r8  - accumulated sum
    mov rdx, rdi
    add rdx, 8
    mov rcx, rsi
    mov r8, 0
.sum_loop:
    add r8d, [rdx]
    add rdx, 4
    sub rcx, 1
    jnz .sum_loop

    add eax, r8d
    mov rdi, [rdi]          ; follow the next pointer
    test rdi, rdi
    jnz .loop

.done:
    ret
|
||||
|
||||
; uint64_t process_data_prefetched0(struct list_node *start, uint64_t data_size)
;
; Same list walk as process_data, but issues a prefetcht0 for the NEXT node
; before summing the current one, so the fetch can overlap with the sum loop.
;
; rdi - linked list start (NULL -> returns 0)
; rsi - node data size, i.e. number of dwords summed per node (0 -> returns 0)
; rax - result; accumulated in eax, so the sum wraps at 32 bits
;
; Node layout used here: qword next pointer at offset 0, dword data from offset 8.
; Clobbers rcx, rdx, rdi, r8, r9 and the flags.
process_data_prefetched0:
    mov eax, 0

    ; Empty list: the first [rdi] load below would dereference NULL.
    test rdi, rdi
    jz .done
    ; Zero elements per node: `sub rcx, 1` would wrap and never hit zero in time.
    test rsi, rsi
    jz .done

.loop:
    ; r9 - next node. Prefetch through the link rather than the address just
    ; past the current node: the two only coincide when nodes are laid out
    ; back to back in list order. A prefetch is only a hint, so the NULL link
    ; of the last node is harmless here.
    mov r9, [rdi]
    prefetcht0 [r9]
    ; rcx - Data array loop counter
    ; rdx - Read location
    ; r8  - accumulated sum
    mov rdx, rdi
    add rdx, 8
    mov rcx, rsi
    mov r8, 0
.sum_loop:
    add r8d, [rdx]
    add rdx, 4
    sub rcx, 1
    jnz .sum_loop

    add eax, r8d
    mov rdi, r9             ; advance to the node that was prefetched
    test rdi, rdi
    jnz .loop

.done:
    ret
|
||||
|
||||
; uint64_t process_data_prefetched1(struct list_node *start, uint64_t data_size)
;
; Same list walk as process_data, but issues a prefetcht1 for the NEXT node
; before summing the current one, so the fetch can overlap with the sum loop.
;
; rdi - linked list start (NULL -> returns 0)
; rsi - node data size, i.e. number of dwords summed per node (0 -> returns 0)
; rax - result; accumulated in eax, so the sum wraps at 32 bits
;
; Node layout used here: qword next pointer at offset 0, dword data from offset 8.
; Clobbers rcx, rdx, rdi, r8, r9 and the flags.
process_data_prefetched1:
    mov eax, 0

    ; Empty list: the first [rdi] load below would dereference NULL.
    test rdi, rdi
    jz .done
    ; Zero elements per node: `sub rcx, 1` would wrap and never hit zero in time.
    test rsi, rsi
    jz .done

.loop:
    ; r9 - next node. Prefetch through the link rather than the address just
    ; past the current node: the two only coincide when nodes are laid out
    ; back to back in list order. A prefetch is only a hint, so the NULL link
    ; of the last node is harmless here.
    mov r9, [rdi]
    prefetcht1 [r9]
    ; rcx - Data array loop counter
    ; rdx - Read location
    ; r8  - accumulated sum
    mov rdx, rdi
    add rdx, 8
    mov rcx, rsi
    mov r8, 0
.sum_loop:
    add r8d, [rdx]
    add rdx, 4
    sub rcx, 1
    jnz .sum_loop

    add eax, r8d
    mov rdi, r9             ; advance to the node that was prefetched
    test rdi, rdi
    jnz .loop

.done:
    ret
|
||||
|
||||
; uint64_t process_data_prefetched2(struct list_node *start, uint64_t data_size)
;
; Same list walk as process_data, but issues a prefetcht2 for the NEXT node
; before summing the current one, so the fetch can overlap with the sum loop.
;
; rdi - linked list start (NULL -> returns 0)
; rsi - node data size, i.e. number of dwords summed per node (0 -> returns 0)
; rax - result; accumulated in eax, so the sum wraps at 32 bits
;
; Node layout used here: qword next pointer at offset 0, dword data from offset 8.
; Clobbers rcx, rdx, rdi, r8, r9 and the flags.
process_data_prefetched2:
    mov eax, 0

    ; Empty list: the first [rdi] load below would dereference NULL.
    test rdi, rdi
    jz .done
    ; Zero elements per node: `sub rcx, 1` would wrap and never hit zero in time.
    test rsi, rsi
    jz .done

.loop:
    ; r9 - next node. Prefetch through the link rather than the address just
    ; past the current node: the two only coincide when nodes are laid out
    ; back to back in list order. A prefetch is only a hint, so the NULL link
    ; of the last node is harmless here.
    mov r9, [rdi]
    prefetcht2 [r9]
    ; rcx - Data array loop counter
    ; rdx - Read location
    ; r8  - accumulated sum
    mov rdx, rdi
    add rdx, 8
    mov rcx, rsi
    mov r8, 0
.sum_loop:
    add r8d, [rdx]
    add rdx, 4
    sub rcx, 1
    jnz .sum_loop

    add eax, r8d
    mov rdi, r9             ; advance to the node that was prefetched
    test rdi, rdi
    jnz .loop

.done:
    ret
|
@ -198,7 +198,7 @@ void repetitor_print_results(struct repetitor *repetitor) {
|
||||
printf("Worst : %16ld : %10.6f : %12ld : %18.6f : %12ld : %18.6f :\n", max_time_taken, cycles_to_ms(repetitor, max_time_taken), max_byte_count, max_bandwidth, max_page_faults, max_kb_per_fault);
|
||||
}
|
||||
|
||||
// Prints a banner line containing `label`, followed by the regular results
// table for `repetitor`. The label is only read, so it is taken as
// `const char *`, which lets callers pass string literals and const strings.
void repetitor_print_results_label(struct repetitor *repetitor, const char *label) {
    printf("--------- %s ---------\n", label);
    repetitor_print_results(repetitor);
}
|
||||
|
Loading…
Reference in New Issue
Block a user