From 995d41c1903d8036d2684d29f0654e588e04edd1 Mon Sep 17 00:00:00 2001 From: Rokas Puzonas Date: Mon, 31 Jul 2023 01:20:39 +0300 Subject: [PATCH] cleanup computation of local boids --- Makefile | 2 +- src/boid-list.cpp | 13 +-- src/boid-list.hpp | 12 +- src/main.cpp | 8 +- src/raycast.cpp | 2 +- src/raycast.hpp | 2 +- src/ui.cpp | 6 +- src/world.cpp | 278 +++++++++++++++++++++++++--------------------- 8 files changed, 174 insertions(+), 149 deletions(-) diff --git a/Makefile b/Makefile index 457edfa..c6743e1 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,7 @@ WEB_HEAP_SIZE := 335544320 WEB_STACK_SIZE := 196608 WEB_SHELL := src/shell.html -COMPILER_FLAGS := -std=c++17 -Wno-enum-compare -O3 -g +COMPILER_FLAGS := -std=c++17 -Wno-enum-compare -O3 -g -flto LINKER_FLAGS := -lraylib # SOURCES := $(wildcard src/*.cpp) diff --git a/src/boid-list.cpp b/src/boid-list.cpp index 77641e4..ee004d2 100644 --- a/src/boid-list.cpp +++ b/src/boid-list.cpp @@ -2,17 +2,17 @@ #include "boid-list.hpp" -static void boid_list_init(BoidList *list) +void boid_list_init(BoidList *list) { list->node.next = NULL; list->count = 0; } -static BoidsListNodeIterator boid_list_get_iterator(BoidList *list) { +BoidsListNodeIterator boid_list_get_iterator(BoidList *list) { return { .count = list->count, .i = 0, .node = &list->node }; } -static bool boid_list_iterator_next(BoidsListNodeIterator *iterator, uboid_t *value) { +bool boid_list_iterator_next(BoidsListNodeIterator *iterator, uboid_t *value) { if (iterator->count == 0) { return false; } @@ -28,7 +28,7 @@ static bool boid_list_iterator_next(BoidsListNodeIterator *iterator, uboid_t *va return true; } -static void boid_list_append(MemoryArena *arena, BoidList *list, uboid_t new_boid) { +void boid_list_append(MemoryArena *arena, BoidList *list, uboid_t new_boid) { int left_count = list->count; BoidListNode *prev = &list->node; BoidListNode *curr = &list->node; @@ -48,7 +48,7 @@ static void boid_list_append(MemoryArena *arena, BoidList *list, 
uboid_t new_boi list->count++; } -static void boid_list_append_unique(MemoryArena *arena, BoidList *list, uboid_t new_boid) { +void boid_list_append_unique(MemoryArena *arena, BoidList *list, uboid_t new_boid) { int left_count = list->count; BoidListNode *last = &list->node; BoidListNode *curr = &list->node; @@ -72,8 +72,7 @@ static void boid_list_append_unique(MemoryArena *arena, BoidList *list, uboid_t list->count++; } -static void boid_list_to_array(uboid_t *result, BoidList *list) -{ +void boid_list_to_array(uboid_t *result, BoidList *list) { int i = 0; uboid_t boid; BoidsListNodeIterator it = boid_list_get_iterator(list); diff --git a/src/boid-list.hpp b/src/boid-list.hpp index 60b07f3..d4cafab 100644 --- a/src/boid-list.hpp +++ b/src/boid-list.hpp @@ -22,11 +22,11 @@ struct BoidsListNodeIterator { BoidListNode *node; }; -static BoidsListNodeIterator boid_list_get_iterator(BoidList *list); -static bool boid_list_iterator_next(BoidsListNodeIterator *iterator, uboid_t *value); +BoidsListNodeIterator boid_list_get_iterator(BoidList *list); +bool boid_list_iterator_next(BoidsListNodeIterator *iterator, uboid_t *value); -static void boid_list_init(BoidList *list); -static void boid_list_append(MemoryArena *arena, BoidList *list, uboid_t new_boid); -static void boid_list_append_unique(MemoryArena *arena, BoidList *list, uboid_t new_boid); +void boid_list_init(BoidList *list); +void boid_list_append(MemoryArena *arena, BoidList *list, uboid_t new_boid); +void boid_list_append_unique(MemoryArena *arena, BoidList *list, uboid_t new_boid); -static void boid_list_to_array(uboid_t *result, BoidList *list); +void boid_list_to_array(uboid_t *result, BoidList *list); diff --git a/src/main.cpp b/src/main.cpp index 9988847..24b3ee8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -54,7 +54,7 @@ int main() { world_init(&g_world, screen_width, screen_height); float border = g_world.collision_avoidance_distance; - for (int i = 0; i < 10000; i++) { + for (int i = 0; i < 33000; i++) 
{ Boid boid; boid_rand_init(&g_world, &boid, border); g_world.boids.push_back(boid); @@ -102,9 +102,9 @@ static void profiling_test() { } rprof_end(); - printf("interactions: %d\n", interactions); - if (interactions != 33119854) { // 22 051 739 - printf("!!!!!! ITERACTIONS DONT MATCH, %d\n", interactions - 33119854); + printf("interactions: %d\n", g_prof_interactions); + if (g_prof_interactions != 33119854) { // 22 051 739 + printf("!!!!!! ITERACTIONS DONT MATCH, %d\n", g_prof_interactions - 33119854); } rprof_output(NULL); diff --git a/src/raycast.cpp b/src/raycast.cpp index abe6166..be27707 100644 --- a/src/raycast.cpp +++ b/src/raycast.cpp @@ -47,7 +47,7 @@ static void get_intersect_with_obstacles(RayHitResult *result, Vector2 ray_origi } } -static void get_intersect_with_world(RayHitResult *result, Vector2 ray_origin, Vector2 ray_dir, World *world) { +void get_intersect_with_world(RayHitResult *result, Vector2 ray_origin, Vector2 ray_dir, World *world) { get_intersect_with_obstacles(result, ray_origin, ray_dir, &world->obstacles); if (result->hit == -1 && !world->looping_walls) { diff --git a/src/raycast.hpp b/src/raycast.hpp index f815502..8c84d08 100644 --- a/src/raycast.hpp +++ b/src/raycast.hpp @@ -13,4 +13,4 @@ static float get_intersect_point(Vector2 ray_origin, Vector2 ray_dir, Vector2 li static void set_nearest_hit(RayHitResult *nearest_hit, float hit, Vector2 line1, Vector2 line2); static void get_intersect_with_polygon(RayHitResult *result, Vector2 ray_origin, Vector2 ray_dir, Vector2 *points, int point_count); static void get_intersect_with_obstacles(RayHitResult *result, Vector2 ray_origin, Vector2 ray_dir, std::vector *obstacles); -static void get_intersect_with_world(RayHitResult *result, Vector2 ray_origin, Vector2 ray_dir, World *world); +void get_intersect_with_world(RayHitResult *result, Vector2 ray_origin, Vector2 ray_dir, World *world); diff --git a/src/ui.cpp b/src/ui.cpp index 7cb10cb..19d8c39 100644 --- a/src/ui.cpp +++ b/src/ui.cpp @@ 
-6,13 +6,11 @@ struct VerticalLayout { float gap; }; -static Rectangle rect_with_offset(Rectangle rect, Vector2 offset) -{ +static Rectangle rect_with_offset(Rectangle rect, Vector2 offset) { return { rect.x + offset.x, rect.y + offset.y, rect.width, rect.height }; } -static Rectangle rect_with_offset(Rectangle rect, float x, float y) -{ +static Rectangle rect_with_offset(Rectangle rect, float x, float y) { return { rect.x + x, rect.y + y, rect.width, rect.height }; } diff --git a/src/world.cpp b/src/world.cpp index cc2b48a..1662f27 100644 --- a/src/world.cpp +++ b/src/world.cpp @@ -125,24 +125,49 @@ static void world_free(World *world) { // --------------------- Update ----------------------- -static int interactions = 0; +struct ChunkGrid { + BoidList *data; + int width; + int height; +}; -static int nearest_multiple(int num, int divisor) -{ +static BoidList *chunkgrid_get(ChunkGrid *grid, int x, int y) { + return &grid->data[y * grid->width + x]; +} + +static void chunkgrid_init(MemoryArena *arena, ChunkGrid *grid, int width, int height) { + grid->data = (BoidList*)arena_malloc(arena, width * height * sizeof(BoidList)); + grid->width = width; + grid->height = height; + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + boid_list_init(chunkgrid_get(grid, x, y)); + } + } +} + +static int g_prof_interactions = 0; + +static int nearest_multiple(int num, int divisor) { return (num / divisor + (num % divisor > 0 ? 
1 : 0)) * divisor; } -// b2b = boid to boid comparison -static void assign_local_boids_b2b(World *world, BoidList *local_boids, uboid_t from_boid, uboid_t to_boid, Vector2 offset, float length_sqr) -{ - assert(to_boid != from_boid); +// b2l = boid to (list of boids) comparison +static void assign_local_boids_b2l(World *world, BoidList *local_boids, uboid_t from_boid, uboid_t *to_boids, uboid_t to_boids_count) { + Boid *boids = world->boids.data(); // Simplified from: float dot_threshold = Vector2DotProduct(dir, Vector2Rotate(dir, world->view_angle/2)); float dot_threshold = cosf(world->view_angle/2); + float view_radius_sqr = world->view_radius * world->view_radius; - bool with_in_range = length_sqr <= (world->view_radius * world->view_radius); - if (with_in_range) { - interactions++; + for (int i = 0; i < to_boids_count; i++) { + uint16_t to_boid = to_boids[i]; + assert(to_boid != from_boid); + + Vector2 offset = Vector2Subtract(boids[from_boid].pos, boids[to_boid].pos); + float length_sqr = Vector2LengthSqr(offset); + if (length_sqr > view_radius_sqr) continue; Vector2 normalized = offset; if (length_sqr != 0) @@ -152,52 +177,19 @@ static void assign_local_boids_b2b(World *world, BoidList *local_boids, uboid_t normalized.y *= ilength; } - // printf("----\n"); - // printf("boid:%d->%d, lengths_sqr:%f, offset:(%f,%f), look:(%f,%f)\n", from_boid, to_boid, lengths_sqr, offset.x, offset.y, boids[from_boid].dir.x, boids[from_boid].dir.y); - - Boid *boids = world->boids.data(); if (Vector2DotProduct(boids[from_boid].dir, Vector2Negate(normalized)) >= dot_threshold) { boid_list_append(&world->frame_arena, &local_boids[from_boid], to_boid); + g_prof_interactions++; } if (Vector2DotProduct(boids[to_boid].dir, normalized) >= dot_threshold) { boid_list_append(&world->frame_arena, &local_boids[to_boid], from_boid); + g_prof_interactions++; } } } -// b2l = boid to (list of boids) comparison -static void assign_local_boids_b2l(World *world, BoidList *local_boids, uboid_t 
from_boid, uboid_t *to_boids, uboid_t to_boids_count) -{ - Boid *boids = world->boids.data(); - int to_boids_count_8 = nearest_multiple(to_boids_count, 8); - - Vector2 to_positions[to_boids_count_8]; - for (int i = 0; i < to_boids_count; i++) { - to_positions[i] = boids[to_boids[i]].pos; - } - - // Vector2 offsets[to_boids_count_8]; - // vector2_sub_simd8(offsets, boids[from_boid].pos, to_positions, to_boids_count_8); - - // float lengths_sqrs[to_boids_count_8]; - // vector2_length_sqr_simd8(lengths_sqrs, offsets, to_boids_count_8); - - for (int i = 0; i < to_boids_count; i++) { - uint16_t to_boid = to_boids[i]; - - // Vector2 offset = offsets[i]; - // float lengths_sqr = lengths_sqrs[i]; - - Vector2 offset = Vector2Subtract(boids[from_boid].pos, boids[to_boid].pos); - float lengths_sqr = Vector2LengthSqr(offset); - - assign_local_boids_b2b(world, local_boids, from_boid, to_boid, offset, lengths_sqr); - } -} - -static void vector2_list_to_simd8(Vector2 *vecs, int vec_count, __m256 *vecs_x, __m256 *vecs_y) -{ - assert(vec_count % 8 == 0 && "Vector2 count must be divisible by 8"); +static void vector2_list_to_simd8(Vector2 *vecs, int vec_count, __m256 *vecs_x, __m256 *vecs_y) { + DEBUG_ASSERT(vec_count % 8 == 0 && "Vector2 count must be divisible by 8"); for (int i = 0; i < vec_count/8; i++) { vecs_x[i] = _mm256_set_ps( @@ -225,47 +217,27 @@ static void vector2_list_to_simd8(Vector2 *vecs, int vec_count, __m256 *vecs_x, } } -static void world_update(World *world, float dt) { - if (world->freeze) return; +// BUG: functions `world_compute_local_boids_simd` and `world_compute_local_boids_scalar` don't give the +// same result. Investigate further with profiling. Something related to iterating neighbour chunks. +// Difference isn't huge, but exists even on -O0. +// Also don't know which one is the more correct one. 
- MemoryArena *arena = &world->frame_arena; - arena_clear(arena); +static void world_compute_local_boids_simd(BoidList *local_boids, World *world, ChunkGrid *chunks) { + struct b2l_cmp { + uboid_t from; + uboid_t *to_list; + uboid_t to_list_count; + + __m256 *to_list_pos_x; + __m256 *to_list_pos_y; + __m256 *to_list_dir_x; + __m256 *to_list_dir_y; + int to_list_pos_count; + }; Boid *boids = world->boids.data(); int boid_count = world->boids.size(); - - assert(boid_count <= MAX_BOIDS); - - RPROF_START("Alloc groups"); - BoidList *all_local_boids = (BoidList*)arena_malloc(arena, boid_count * sizeof(BoidList)); - for (int i = 0; i < boid_count; i++) { - boid_list_init(&all_local_boids[i]); - } - RPROF_STOP(); - - size_t alloc_chunks = world->frame_arena.offset; - float chunk_size = std::max(world->view_radius, 15.0f); - int chunks_wide = std::ceil(world->size.x / chunk_size) + 1; - int chunks_high = std::ceil(world->size.y / chunk_size) + 1; - RPROF_START("Alloc chunks"); - BoidList *chunks[chunks_high][chunks_wide]; - for (int y = 0; y < chunks_high; y++) { - for (int x = 0; x < chunks_wide; x++) { - chunks[y][x] = (BoidList*)arena_malloc(arena, sizeof(BoidList)); - boid_list_init(chunks[y][x]); - } - } - RPROF_STOP(); - - RPROF_START("Creating chunks"); - for (int i = 0; i < boid_count; i++) { - Boid *boid = &boids[i]; - int chunk_x = boid->pos.x / chunk_size; - int chunk_y = boid->pos.y / chunk_size; - - boid_list_append(arena, chunks[chunk_y][chunk_x], i); - } - RPROF_STOP(); + MemoryArena *arena = &world->frame_arena; RPROF_START("Extracting boid positions"); Vector2 *boid_dirs = (Vector2*)arena_malloc(arena, sizeof(Vector2)*boid_count); @@ -276,31 +248,21 @@ static void world_update(World *world, float dt) { } RPROF_STOP(); + // Simplified from: + // float dot_threshold = Vector2DotProduct(dir, Vector2Rotate(dir, world->view_angle/2)); + // | + // v + // float dot_threshold = cosf(world->view_angle/2); + __m256 dot_threshold = 
_mm256_set1_ps(cosf(world->view_angle/2)); - int chunk_cmps = 0; - RPROF_START("Calc dot products and ranges (chunked)"); + RPROF_START("Calc dot products and ranges (simd)"); // TODO: Use temp memory arena inside this profile block - // int32_t *in_range_mask_f32 = (int32_t*)arena_malloc(arena, sizeof(int32_t)*8, 32); int32_t *do_append_mask1_f32 = (int32_t*)arena_malloc(arena, sizeof(int32_t)*8, 32); int32_t *do_append_mask2_f32 = (int32_t*)arena_malloc(arena, sizeof(int32_t)*8, 32); - for (int y = 0; y < chunks_high; y++) { - - Vector2 neighbours[] = { { 1, 0 }, { 0, 1 }, { 1, 1 }, { -1, 1 } }; - struct b2l_cmp { - uboid_t from; - uboid_t *to_list; - uboid_t to_list_count; - - __m256 *to_list_pos_x; - __m256 *to_list_pos_y; - __m256 *to_list_dir_x; - __m256 *to_list_dir_y; - int to_list_pos_count; - }; - - for (int x = 0; x < chunks_wide; x++) { - BoidList *chunk = chunks[y][x]; + for (int y = 0; y < chunks->height; y++) { + for (int x = 0; x < chunks->width; x++) { + BoidList *chunk = chunkgrid_get(chunks, x, y); if (chunk->count == 0) continue; std::vector b2l_cmps; // TODO: remove usage of std::vec, it is kinda slow @@ -340,13 +302,14 @@ static void world_update(World *world, float dt) { b2l_cmps.push_back(cmp); } + Vector2 neighbours[] = { { 1, 0 }, { 0, 1 }, { 1, 1 }, { -1, 1 } }; for (int i = 0; i < ARRAY_LEN(neighbours); i++) { int chunk_y = y + neighbours[i].y; int chunk_x = x + neighbours[i].x; - if (chunk_y < 0 || chunk_y >= chunks_high) continue; - if (chunk_x < 0 || chunk_x >= chunks_wide) continue; + if (chunk_y < 0 || chunk_y >= chunks->height) continue; + if (chunk_x < 0 || chunk_x >= chunks->width) continue; - BoidList *neighbour_chunk = chunks[chunk_y][chunk_x]; + BoidList *neighbour_chunk = chunkgrid_get(chunks, chunk_x, chunk_y); if (neighbour_chunk->count == 0) continue; // TODO: alloc 'neighbour_ids' into scratch arena @@ -393,10 +356,6 @@ static void world_update(World *world, float dt) { float view_radius_sqr = world->view_radius * 
world->view_radius; - // Simplified from: float dot_threshold = Vector2DotProduct(dir, Vector2Rotate(dir, world->view_angle/2)); - float dot_threshold_single = cosf(world->view_angle/2); - __m256 dot_threshold = _mm256_set1_ps(dot_threshold_single); - __m256 view_radius = _mm256_set1_ps(view_radius_sqr); __m256 from_pos_x = _mm256_set1_ps(from_pos.x); __m256 from_pos_y = _mm256_set1_ps(from_pos.y); @@ -444,34 +403,45 @@ static void world_update(World *world, float dt) { uboid_t to_boid = cmp->to_list[to_boid_idx]; if (do_append_mask1_f32[k]) { - boid_list_append(&world->frame_arena, &all_local_boids[from_boid], to_boid); - interactions++; + boid_list_append(&world->frame_arena, &local_boids[from_boid], to_boid); + g_prof_interactions++; } if (do_append_mask2_f32[k]) { - boid_list_append(&world->frame_arena, &all_local_boids[to_boid], from_boid); - interactions++; + boid_list_append(&world->frame_arena, &local_boids[to_boid], from_boid); + g_prof_interactions++; } } } } + } + } + RPROF_STOP(); +} + +static void world_compute_local_boids_scalar(BoidList *local_boids, World *world, ChunkGrid *chunks) { + RPROF_START("Calc dot products and ranges (scalar)"); + for (int y = 0; y < chunks->height; y++) { + for (int x = 0; x < chunks->width; x++) { + BoidList *chunk = chunkgrid_get(chunks, x, y); + if (chunk->count == 0) continue; - /* uboid_t chunk_boids[chunk->count]; boid_list_to_array(chunk_boids, chunk); for (int i = 0; i < chunk->count-1; i++) { uboid_t from_boid = chunk_boids[i]; uboid_t *to_boids = &chunk_boids[i+1]; uboid_t to_boids_count = chunk->count-i-1; - assign_local_boids_b2l(world, all_local_boids, from_boid, to_boids, to_boids_count); + assign_local_boids_b2l(world, local_boids, from_boid, to_boids, to_boids_count); } + Vector2 neighbours[] = { { 1, 0 }, { 0, 1 }, { 1, 1 }, { -1, 1 } }; for (int i = 0; i < ARRAY_LEN(neighbours); i++) { int chunk_y = y + neighbours[i].y; int chunk_x = x + neighbours[i].x; - if (chunk_y < 0 || chunk_y >= chunks_high) 
continue; - if (chunk_x < 0 || chunk_x >= chunks_wide) continue; + if (chunk_y < 0 || chunk_y >= chunks->height) continue; + if (chunk_x < 0 || chunk_x >= chunks->width) continue; - BoidList *neighbour_chunk = chunks[chunk_y][chunk_x]; + BoidList *neighbour_chunk = chunkgrid_get(chunks, chunk_x, chunk_y); if (neighbour_chunk->count == 0) continue; uboid_t neighbour_ids[neighbour_chunk->count]; @@ -480,22 +450,80 @@ static void world_update(World *world, float dt) { uboid_t boid1; BoidsListNodeIterator it1 = boid_list_get_iterator(chunk); while (boid_list_iterator_next(&it1, &boid1)) { - assign_local_boids_b2l(world, all_local_boids, boid1, neighbour_ids, neighbour_chunk->count); + assign_local_boids_b2l(world, local_boids, boid1, neighbour_ids, neighbour_chunk->count); } } - */ } - - } RPROF_STOP(); +} + +static BoidList* world_compute_local_boids(World *world) { + Boid *boids = world->boids.data(); + int boid_count = world->boids.size(); + MemoryArena *arena = &world->frame_arena; + + RPROF_START("Alloc groups"); + BoidList *all_local_boids = (BoidList*)arena_malloc(arena, boid_count * sizeof(BoidList)); + for (int i = 0; i < boid_count; i++) { + boid_list_init(&all_local_boids[i]); + } + RPROF_STOP(); + + size_t alloc_chunks = world->frame_arena.offset; + float chunk_size = std::max(world->view_radius, 15.0f); + int chunks_wide = std::ceil(world->size.x / chunk_size) + 1; + int chunks_high = std::ceil(world->size.y / chunk_size) + 1; + + RPROF_START("Alloc chunks"); + ChunkGrid chunks; + chunkgrid_init(arena, &chunks, chunks_wide, chunks_high); + RPROF_STOP(); + + RPROF_START("Assign boids to chunks"); + for (int i = 0; i < boid_count; i++) { + Boid *boid = &boids[i]; + int chunk_x = boid->pos.x / chunk_size; + int chunk_y = boid->pos.y / chunk_size; + + boid_list_append(arena, chunkgrid_get(&chunks, chunk_x, chunk_y), i); + } + RPROF_STOP(); + + RPROF_START("Extracting boid positions"); + Vector2 *boid_dirs = (Vector2*)arena_malloc(arena, 
sizeof(Vector2)*boid_count); + Vector2 *boid_positions = (Vector2*)arena_malloc(arena, sizeof(Vector2)*boid_count); + for (int i = 0; i < boid_count; i++) { + boid_positions[i] = boids[i].pos; + boid_dirs[i] = boids[i].dir; + } + RPROF_STOP(); + + RPROF_START("world_compute_local_boids()"); + // TODO: Use scalar version for WASM or make 128bit version. + world_compute_local_boids_simd(all_local_boids, world, &chunks); + RPROF_STOP(); + + return all_local_boids; +} + +static void world_update(World *world, float dt) { + if (world->freeze) return; + + arena_clear(&world->frame_arena); + + Boid *boids = world->boids.data(); + int boid_count = world->boids.size(); + assert(boid_count <= MAX_BOIDS); + + BoidList *list_of_local_boids = world_compute_local_boids(world); RPROF_START("Apply forces"); for (int i = 0; i < boid_count; i++) { Boid *boid = &world->boids[i]; Vector2 acc = { 0, 0 }; - BoidList *local_boids = &all_local_boids[i]; + BoidList *local_boids = &list_of_local_boids[i]; if (local_boids->count > 0) { Vector2 separation_force = { 0, 0 };