From fbc02c6ff8331a3d9177dc651fe0019478e1a45e Mon Sep 17 00:00:00 2001
From: Rokas Puzonas
Date: Sun, 23 Jun 2024 17:42:53 +0300
Subject: [PATCH] add manual prefetching test

---
 src/15_manual_prefetch/main.c       | 168 ++++++++++++++++++++++++++++++++++
 src/15_manual_prefetch/prefetch.asm | 130 ++++++++++++++++++++++++++
 src/repetition_tester.c             |   2 +-
 3 files changed, 299 insertions(+), 1 deletion(-)
 create mode 100644 src/15_manual_prefetch/main.c
 create mode 100644 src/15_manual_prefetch/prefetch.asm

diff --git a/src/15_manual_prefetch/main.c b/src/15_manual_prefetch/main.c
new file mode 100644
index 0000000..b2261dc
--- /dev/null
+++ b/src/15_manual_prefetch/main.c
@@ -0,0 +1,168 @@
+#include "repetition_tester.c"
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <time.h>
+
+// I wasn't able to complete this homework.
+// Failed to create a case where the prefetch instruction helped.
+//
+// Now that I think about it, it is probably because my "hot loop" when
+// processing the data is memory bound and not CPU bound. idk
+//
+// NOTE(review): the prefetch variants must hint the node reached through
+// `next`; the node that follows in memory is a different one once the list is
+// shuffled. Results with that fix have not been re-measured yet.
+
+#define NODE_DATA_SIZE 32
+
+// Converts a count of KiB to bytes.
+static uint64_t kibibytes(uint64_t count) {
+    return 1024 * count;
+}
+
+// Converts a count of MiB to bytes.
+static uint64_t mibibytes(uint64_t count) {
+    return 1024 * kibibytes(count);
+}
+
+// Converts a count of GiB to bytes.
+static uint64_t gibibytes(uint64_t count) {
+    return 1024 * mibibytes(count);
+}
+
+// Singly linked list node. prefetch.asm reads `next` at offset 0 and the data
+// words starting at offset 8, so the field order must not change.
+struct list_node {
+    struct list_node *next;
+    uint32_t data[NODE_DATA_SIZE];
+};
+
+typedef uint64_t (*process_fn)(struct list_node *start, uint64_t data_size);
+
+// Implemented in prefetch.asm. Each walks the list from `start` and sums
+// `data_size` 32-bit words per node; the sum is accumulated in 32 bits.
+uint64_t process_data(struct list_node *start, uint64_t data_size);
+uint64_t process_data_prefetched0(struct list_node *start, uint64_t data_size);
+uint64_t process_data_prefetched1(struct list_node *start, uint64_t data_size);
+uint64_t process_data_prefetched2(struct list_node *start, uint64_t data_size);
+
+// Removes list[index] by shifting the following entries down one slot and
+// decrements *count. Requires index < *count on entry.
+static void remove_from_list(int *list, size_t *count, size_t index)
+{
+    (*count)--;
+    for (size_t i = index; i < *count; i++) {
+        list[i] = list[i+1];
+    }
+}
+
+// Builds a linked list inside an mmap'ed pool (linear or shuffled order) and
+// times each process_data variant with the repetition tester.
+int main() {
+    uint64_t buffer_size = gibibytes(1);
+    if (buffer_size % 4096) {
+        printf("ERROR: Size of buffer is not page aligned\n");
+        return -1;
+    }
+
+    // mmap signals failure with MAP_FAILED, never NULL.
+    uint8_t *buffer = mmap(0, buffer_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (buffer == MAP_FAILED) {
+        printf("ERROR: Failed to allocate buffer\n");
+        return -1;
+    }
+
+    struct repetitor repetitor = {};
+    repetitor_init(&repetitor);
+    printf("CPU Frequency: %ldHz (~%.2fGHz)\n", repetitor.cpu_freq, (float)repetitor.cpu_freq/(1000*1000*1000));
+
+    uint64_t list_size = 1024;
+    struct list_node *node_pool = (struct list_node *)buffer;
+    // Zeroing leaves every `next` NULL, so the last linked node ends the list.
+    memset(node_pool, 0, sizeof(struct list_node) * list_size);
+
+    bool allocate_nodes_linearly = false;
+    struct list_node *start = &node_pool[0];
+
+    if (allocate_nodes_linearly) {
+        for (uint64_t i = 0; i < list_size - 1; i++) {
+            node_pool[i].next = &node_pool[i+1];
+        }
+    } else {
+        srand(time(NULL));
+
+        int *free_ids = calloc(list_size, sizeof(int));
+        size_t free_id_count = list_size;
+        assert(free_ids);
+        for (uint64_t i = 0; i < list_size; i++) {
+            free_ids[i] = (int)i;
+        }
+
+        // Node 0 is the list head, so it is never a link target.
+        remove_from_list(free_ids, &free_id_count, 0);
+        struct list_node *current = start;
+        while (free_id_count > 0) {
+            size_t free_id_index = rand() % free_id_count;
+            int free_id = free_ids[free_id_index];
+            remove_from_list(free_ids, &free_id_count, free_id_index);
+            current->next = &node_pool[free_id];
+            current = &node_pool[free_id];
+            // printf("assign %d\n", free_id);
+        }
+        free(free_ids);
+    }
+
+    { // Initialize node data
+        size_t current_value = 0;
+        struct list_node *current = start;
+        while (current) {
+            for (int i = 0; i < NODE_DATA_SIZE; i++) {
+                current->data[i] = current_value;
+                current_value++;
+            }
+            current = current->next;
+        }
+    }
+
+    // Sum of 0..max_value-1, truncated to 32 bits like the asm accumulator.
+    uint64_t max_value = list_size * NODE_DATA_SIZE;
+    uint32_t expected_result = (uint32_t)(max_value * (max_value-1) / 2);
+
+    struct {
+        const char *name;
+        process_fn fn;
+    } cases[] = {
+        { .name = "no prefetch", .fn = process_data },
+        { .name = "prefetcht0", .fn = process_data_prefetched0 },
+        { .name = "prefetcht1", .fn = process_data_prefetched1 },
+        { .name = "prefetcht2", .fn = process_data_prefetched2 }
+    };
+
+    for (size_t i = 0; i < ARRAY_LEN(cases); i++) {
+        process_fn fn = cases[i].fn;
+
+        repetitor_clear(&repetitor);
+        while (repetitor_repeat(&repetitor, 5)) {
+            repetitor_start(&repetitor);
+            repetitor_measure_start(&repetitor);
+            uint32_t result = (uint32_t)fn(start, NODE_DATA_SIZE);
+            // NOTE(review): this passes the element count; if the tester expects
+            // bytes it should be scaled by sizeof(uint32_t) - confirm.
+            repetitor_measure_stop(&repetitor, NODE_DATA_SIZE * list_size);
+            repetitor_stop(&repetitor);
+            // TODO: For some f-ing reason doing an assert here destroys the performance for the "no prefetch" case, WHYYYYYYYYYYY????
+            // Weird stuff
+            // assert(result == expected_result);
+        }
+
+        repetitor_print_results_label(&repetitor, cases[i].name);
+    }
+
+    munmap(buffer, buffer_size);
+    return 0;
+}
diff --git a/src/15_manual_prefetch/prefetch.asm b/src/15_manual_prefetch/prefetch.asm
new file mode 100644
index 0000000..4f3b668
--- /dev/null
+++ b/src/15_manual_prefetch/prefetch.asm
@@ -0,0 +1,130 @@
+global process_data
+global process_data_prefetched0
+global process_data_prefetched1
+global process_data_prefetched2
+
+section .text
+
+; Sums the 32-bit data words of every node in a linked list.
+; rdi - linked list start
+; rsi - node data size
+; eax - (return) sum, accumulated in 32 bits
+process_data:
+    mov eax, 0
+
+    .loop:
+        ; rcx - Data array loop counter
+        ; rdx - Read location
+        ; r8 - accumulated sum
+        mov rdx, rdi
+        add rdx, 8
+        mov rcx, rsi
+        mov r8, 0
+        .sum_loop:
+            add r8d, [rdx]
+            add rdx, 4
+            sub rcx, 1
+            jnz .sum_loop
+
+        add eax, r8d
+        mov rdi, [rdi]
+        test rdi, rdi
+        jnz .loop
+
+    ret
+
+; Same as process_data, but hints the next node with prefetcht0.
+; rdi - linked list start
+; rsi - node data size
+; eax - (return) sum, accumulated in 32 bits
+process_data_prefetched0:
+    mov eax, 0
+
+    .loop:
+        ; r9 - Next node. Hint its first cache line while this node is summed;
+        ; a prefetch hint never faults, so a NULL next pointer is harmless.
+        mov r9, [rdi]
+        prefetcht0 [r9]
+        ; rcx - Data array loop counter
+        ; rdx - Read location
+        ; r8 - accumulated sum
+        mov rdx, rdi
+        add rdx, 8
+        mov rcx, rsi
+        mov r8, 0
+        .sum_loop:
+            add r8d, [rdx]
+            add rdx, 4
+            sub rcx, 1
+            jnz .sum_loop
+
+        add eax, r8d
+        mov rdi, r9
+        test rdi, rdi
+        jnz .loop
+
+    ret
+
+; Same as process_data, but hints the next node with prefetcht1.
+; rdi - linked list start
+; rsi - node data size
+; eax - (return) sum, accumulated in 32 bits
+process_data_prefetched1:
+    mov eax, 0
+
+    .loop:
+        ; r9 - Next node. Hint its first cache line while this node is summed;
+        ; a prefetch hint never faults, so a NULL next pointer is harmless.
+        mov r9, [rdi]
+        prefetcht1 [r9]
+        ; rcx - Data array loop counter
+        ; rdx - Read location
+        ; r8 - accumulated sum
+        mov rdx, rdi
+        add rdx, 8
+        mov rcx, rsi
+        mov r8, 0
+        .sum_loop:
+            add r8d, [rdx]
+            add rdx, 4
+            sub rcx, 1
+            jnz .sum_loop
+
+        add eax, r8d
+        mov rdi, r9
+        test rdi, rdi
+        jnz .loop
+
+    ret
+
+; Same as process_data, but hints the next node with prefetcht2.
+; rdi - linked list start
+; rsi - node data size
+; eax - (return) sum, accumulated in 32 bits
+process_data_prefetched2:
+    mov eax, 0
+
+    .loop:
+        ; r9 - Next node. Hint its first cache line while this node is summed;
+        ; a prefetch hint never faults, so a NULL next pointer is harmless.
+        mov r9, [rdi]
+        prefetcht2 [r9]
+        ; rcx - Data array loop counter
+        ; rdx - Read location
+        ; r8 - accumulated sum
+        mov rdx, rdi
+        add rdx, 8
+        mov rcx, rsi
+        mov r8, 0
+        .sum_loop:
+            add r8d, [rdx]
+            add rdx, 4
+            sub rcx, 1
+            jnz .sum_loop
+
+        add eax, r8d
+        mov rdi, r9
+        test rdi, rdi
+        jnz .loop
+
+    ret
diff --git a/src/repetition_tester.c b/src/repetition_tester.c
index 7e668ff..70be751 100644
--- a/src/repetition_tester.c
+++ b/src/repetition_tester.c
@@ -198,7 +198,7 @@ void repetitor_print_results(struct repetitor *repetitor) {
     printf("Worst : %16ld : %10.6f : %12ld : %18.6f : %12ld : %18.6f :\n", max_time_taken, cycles_to_ms(repetitor, max_time_taken), max_byte_count, max_bandwidth, max_page_faults, max_kb_per_fault);
 }
 
-void repetitor_print_results_label(struct repetitor *repetitor, char *label) {
+void repetitor_print_results_label(struct repetitor *repetitor, const char *label) {
     printf("--------- %s ---------\n", label);
     repetitor_print_results(repetitor);
 }