From 50c860ddf9a9f8d9dea2877f1bc4f33f94e73a96 Mon Sep 17 00:00:00 2001 From: Rokas Puzonas Date: Tue, 1 Aug 2023 18:06:55 +0300 Subject: [PATCH] improve performance --- src/boid-playground.hpp | 2 +- src/main.cpp | 2 +- src/world.cpp | 440 ++++++++++++++++++++++------------------ 3 files changed, 250 insertions(+), 194 deletions(-) diff --git a/src/boid-playground.hpp b/src/boid-playground.hpp index 59a05b1..18cea93 100644 --- a/src/boid-playground.hpp +++ b/src/boid-playground.hpp @@ -12,7 +12,7 @@ #define DEBUG_ASSERT(...) assert(__VA_ARGS__) typedef uint16_t uboid_t; -#define MAX_BOIDS 1 << (sizeof(uboid_t)*8) +#define MAX_BOIDS (1 << (sizeof(uboid_t)*8)) struct Boid { Vector2 pos; diff --git a/src/main.cpp b/src/main.cpp index 24b3ee8..c825374 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -54,7 +54,7 @@ int main() { world_init(&g_world, screen_width, screen_height); float border = g_world.collision_avoidance_distance; - for (int i = 0; i < 33000; i++) { + for (int i = 0; i < 45000; i++) { Boid boid; boid_rand_init(&g_world, &boid, border); g_world.boids.push_back(boid); diff --git a/src/world.cpp b/src/world.cpp index 1662f27..dd125c0 100644 --- a/src/world.cpp +++ b/src/world.cpp @@ -46,19 +46,27 @@ static void fill_avoidance_ray_angles(float *rays, int ray_count, float ray_angl } } -static Vector2 get_collision_avoidance_dir(World *world, Boid *boid) { - int ray_count = world->collision_avoidance_ray_count * 2 + 1; +static void fill_avoidance_ray_dirs(Vector2 *rays, int ray_count, float ray_angle) { float ray_angles[ray_count]; - fill_avoidance_ray_angles(ray_angles, ray_count, world->collision_avoidance_ray_angle); - - int best_avoidance = -1; - Vector2 avoidance_dir = { 0, 0 }; - float facing = std::atan2(boid->dir.y, boid->dir.x); - bool got_hit = false; - RayHitResult hit_results[ray_count]; + fill_avoidance_ray_angles(ray_angles, ray_count, ray_angle); for (int i = 0; i < ray_count; i++) { - Vector2 ray_dir = vector2_from_angle(facing + ray_angles[i]); + rays[i].x = cos(ray_angles[i]); + rays[i].y = sin(ray_angles[i]); + } +} + +static Vector2 get_collision_avoidance_dir(World *world, Boid *boid, Vector2 *directions, int direction_count) { + int best_avoidance = -1; + Vector2 avoidance_dir = { 0, 0 }; + bool got_hit = false; + RayHitResult hit_results[direction_count]; + + for (int i = 0; i < direction_count; i++) { + Vector2 ray_dir; + ray_dir.x = boid->dir.x * directions[i].x - boid->dir.y * directions[i].y; + ray_dir.y = boid->dir.x * directions[i].y + boid->dir.y * directions[i].x; + get_intersect_with_world(&hit_results[i], boid->pos, ray_dir, world); if (hit_results[i].hit != -1 && hit_results[i].hit <= world->collision_avoidance_distance) { got_hit = true; @@ -131,8 +139,12 @@ struct ChunkGrid { int height; }; +static size_t chunkgrid_get_idx(ChunkGrid *grid, int x, int y) { + return y * grid->width + x; +} + static BoidList *chunkgrid_get(ChunkGrid *grid, int x, int y) { - return &grid->data[y * grid->width + x]; + return &grid->data[chunkgrid_get_idx(grid, x, y)]; } static void chunkgrid_init(MemoryArena *arena, ChunkGrid *grid, int width, int height) { @@ -217,37 +229,19 @@ static void vector2_list_to_simd8(Vector2 *vecs, int vec_count, __m256 *vecs_x, } } -// BUG: functions `world_compute_local_boids_simd` and `world_compute_local_boids_scalar` don't give the -// same result. Investiagte further with profiling. Something related with iterating neighbour chunks. -// Difference isin't huge, but exists even on -O0. -// Also don't know which one is the more correct one. +static inline bool mm256_is_zero(__m256i x) { + return _mm256_testz_si256(x, x); +} +static inline bool mm256_is_zero(__m256 x) { + return mm256_is_zero((__m256i)x); +} -static void world_compute_local_boids_simd(BoidList *local_boids, World *world, ChunkGrid *chunks) { - struct b2l_cmp { - uboid_t from; - uboid_t *to_list; - uboid_t to_list_count; - - __m256 *to_list_pos_x; - __m256 *to_list_pos_y; - __m256 *to_list_dir_x; - __m256 *to_list_dir_y; - int to_list_pos_count; - }; - - Boid *boids = world->boids.data(); - int boid_count = world->boids.size(); - MemoryArena *arena = &world->frame_arena; - - RPROF_START("Extracting boid positions"); - Vector2 *boid_dirs = (Vector2*)arena_malloc(arena, sizeof(Vector2)*boid_count); - Vector2 *boid_positions = (Vector2*)arena_malloc(arena, sizeof(Vector2)*boid_count); - for (int i = 0; i < boid_count; i++) { - boid_positions[i] = boids[i].pos; - boid_dirs[i] = boids[i].dir; - } - RPROF_STOP(); +struct boid_pair { + uboid_t from; + uboid_t to; +}; +static void world_process_local_boid_pairs(World *world, BoidList *local_boids, boid_pair *b2b_cmps, int *b2b_cmps_count) { // Simplified from: // float dot_threshold = Vector2DotProduct(dir, Vector2Rotate(dir, world->view_angle/2)); // | @@ -255,166 +249,233 @@ static void world_compute_local_boids_simd(BoidList *local_boids, World *world, // float dot_threshold = cosf(world->view_angle/2); __m256 dot_threshold = _mm256_set1_ps(cosf(world->view_angle/2)); - RPROF_START("Calc dot products and ranges (simd)"); - // TODO: Use temp memory arena inside this profile block - int32_t *do_append_mask1_f32 = (int32_t*)arena_malloc(arena, sizeof(int32_t)*8, 32); - int32_t *do_append_mask2_f32 = (int32_t*)arena_malloc(arena, sizeof(int32_t)*8, 32); + float view_radius_sqr = world->view_radius * world->view_radius; + __m256 view_radius = _mm256_set1_ps(view_radius_sqr); + __m256 zero = _mm256_set1_ps(0); + __m256 negative_one = _mm256_set1_ps(-1); + Boid *boids = world->boids.data(); + int32_t *do_append_mask1_f32 = (int32_t*)arena_malloc(&world->frame_arena, sizeof(int32_t)*8, 32); + int32_t *do_append_mask2_f32 = (int32_t*)arena_malloc(&world->frame_arena, sizeof(int32_t)*8, 32); + + int simd_iteration_count = nearest_multiple(*b2b_cmps_count, 8)/8; + for (int i = 0; i < simd_iteration_count; i++) { + __m256 from_pos_x = _mm256_set_ps( + boids[b2b_cmps[8*i+7].from].pos.x, + boids[b2b_cmps[8*i+6].from].pos.x, + boids[b2b_cmps[8*i+5].from].pos.x, + boids[b2b_cmps[8*i+4].from].pos.x, + boids[b2b_cmps[8*i+3].from].pos.x, + boids[b2b_cmps[8*i+2].from].pos.x, + boids[b2b_cmps[8*i+1].from].pos.x, + boids[b2b_cmps[8*i+0].from].pos.x + ); + + __m256 from_pos_y = _mm256_set_ps( + boids[b2b_cmps[8*i+7].from].pos.y, + boids[b2b_cmps[8*i+6].from].pos.y, + boids[b2b_cmps[8*i+5].from].pos.y, + boids[b2b_cmps[8*i+4].from].pos.y, + boids[b2b_cmps[8*i+3].from].pos.y, + boids[b2b_cmps[8*i+2].from].pos.y, + boids[b2b_cmps[8*i+1].from].pos.y, + boids[b2b_cmps[8*i+0].from].pos.y + ); + + __m256 to_pos_x = _mm256_set_ps( + boids[b2b_cmps[8*i+7].to].pos.x, + boids[b2b_cmps[8*i+6].to].pos.x, + boids[b2b_cmps[8*i+5].to].pos.x, + boids[b2b_cmps[8*i+4].to].pos.x, + boids[b2b_cmps[8*i+3].to].pos.x, + boids[b2b_cmps[8*i+2].to].pos.x, + boids[b2b_cmps[8*i+1].to].pos.x, + boids[b2b_cmps[8*i+0].to].pos.x + ); + + __m256 to_pos_y = _mm256_set_ps( + boids[b2b_cmps[8*i+7].to].pos.y, + boids[b2b_cmps[8*i+6].to].pos.y, + boids[b2b_cmps[8*i+5].to].pos.y, + boids[b2b_cmps[8*i+4].to].pos.y, + boids[b2b_cmps[8*i+3].to].pos.y, + boids[b2b_cmps[8*i+2].to].pos.y, + boids[b2b_cmps[8*i+1].to].pos.y, + boids[b2b_cmps[8*i+0].to].pos.y + ); + + __m256 sub_x = _mm256_sub_ps(from_pos_x, to_pos_x); + __m256 sub_y = _mm256_sub_ps(from_pos_y, to_pos_y); + + __m256 length_sqr = _mm256_fmadd_ps(sub_y, sub_y, _mm256_mul_ps(sub_x, sub_x)); + __m256i in_range_mask = (__m256i)_mm256_cmp_ps(length_sqr, view_radius, _CMP_LE_OQ); + if (mm256_is_zero(in_range_mask)) continue; + + __m256 from_dir_x = _mm256_set_ps( + boids[b2b_cmps[8*i+7].from].dir.x, + boids[b2b_cmps[8*i+6].from].dir.x, + boids[b2b_cmps[8*i+5].from].dir.x, + boids[b2b_cmps[8*i+4].from].dir.x, + boids[b2b_cmps[8*i+3].from].dir.x, + boids[b2b_cmps[8*i+2].from].dir.x, + boids[b2b_cmps[8*i+1].from].dir.x, + boids[b2b_cmps[8*i+0].from].dir.x + ); + + __m256 from_dir_y = _mm256_set_ps( + boids[b2b_cmps[8*i+7].from].dir.y, + boids[b2b_cmps[8*i+6].from].dir.y, + boids[b2b_cmps[8*i+5].from].dir.y, + boids[b2b_cmps[8*i+4].from].dir.y, + boids[b2b_cmps[8*i+3].from].dir.y, + boids[b2b_cmps[8*i+2].from].dir.y, + boids[b2b_cmps[8*i+1].from].dir.y, + boids[b2b_cmps[8*i+0].from].dir.y + ); + + __m256 to_dir_x = _mm256_set_ps( + boids[b2b_cmps[8*i+7].to].dir.x, + boids[b2b_cmps[8*i+6].to].dir.x, + boids[b2b_cmps[8*i+5].to].dir.x, + boids[b2b_cmps[8*i+4].to].dir.x, + boids[b2b_cmps[8*i+3].to].dir.x, + boids[b2b_cmps[8*i+2].to].dir.x, + boids[b2b_cmps[8*i+1].to].dir.x, + boids[b2b_cmps[8*i+0].to].dir.x + ); + + __m256 to_dir_y = _mm256_set_ps( + boids[b2b_cmps[8*i+7].to].dir.y, + boids[b2b_cmps[8*i+6].to].dir.y, + boids[b2b_cmps[8*i+5].to].dir.y, + boids[b2b_cmps[8*i+4].to].dir.y, + boids[b2b_cmps[8*i+3].to].dir.y, + boids[b2b_cmps[8*i+2].to].dir.y, + boids[b2b_cmps[8*i+1].to].dir.y, + boids[b2b_cmps[8*i+0].to].dir.y + ); + + __m256 is_length_zero = (__m256)_mm256_cmpeq_epi32((__m256i)length_sqr, (__m256i)zero); + __m256 ilength = _mm256_blendv_ps(_mm256_rsqrt_ps(length_sqr), zero, is_length_zero); + + __m256 x_norm = _mm256_mul_ps(sub_x, ilength); + __m256 y_norm = _mm256_mul_ps(sub_y, ilength); + + __m256 x_neg_norm = _mm256_mul_ps(x_norm, negative_one); + __m256 y_neg_norm = _mm256_mul_ps(y_norm, negative_one); + + __m256 dot_product1 = _mm256_fmadd_ps(from_dir_y, y_neg_norm, _mm256_mul_ps(from_dir_x, x_neg_norm)); + __m256 in_angle_mask1 = _mm256_cmp_ps(dot_product1, dot_threshold, _CMP_GE_OQ); + __m256 do_append_mask1 = _mm256_and_ps(in_angle_mask1, (__m256)in_range_mask); + + __m256 dot_product2 = _mm256_fmadd_ps(to_dir_y, y_norm, _mm256_mul_ps(to_dir_x, x_norm)); + __m256 in_angle_mask2 = _mm256_cmp_ps(dot_product2, dot_threshold, _CMP_GE_OQ); + __m256 do_append_mask2 = _mm256_and_ps(in_angle_mask2, (__m256)in_range_mask); + + _mm256_store_ps((float*)do_append_mask1_f32, do_append_mask1); + _mm256_store_ps((float*)do_append_mask2_f32, do_append_mask2); + for (int j = 0; j < 8; j++) { + uboid_t cmp_idx = 8*i + j; + if (cmp_idx >= *b2b_cmps_count) break; + + uboid_t from_boid = b2b_cmps[cmp_idx].from; + uboid_t to_boid = b2b_cmps[cmp_idx].to; + if (do_append_mask1_f32[j]) { + boid_list_append(&world->frame_arena, &local_boids[from_boid], to_boid); + g_prof_interactions++; + } + if (do_append_mask2_f32[j]) { + boid_list_append(&world->frame_arena, &local_boids[to_boid], from_boid); + g_prof_interactions++; + } + } + } + + *b2b_cmps_count = 0; +} + +static void world_compute_local_boids_simd(BoidList *local_boids, World *world, ChunkGrid *chunks) { + Boid *boids = world->boids.data(); + int boid_count = world->boids.size(); + MemoryArena *arena = &world->frame_arena; + + int b2b_padding = 8; + int b2b_capacity = 2048*10; + boid_pair b2b_cmps[b2b_capacity + b2b_padding]; + int b2b_cmps_count = 0; + for (int i = 0; i < b2b_padding; i++) { + memset(&b2b_cmps[b2b_capacity + i], 0, sizeof(boid_pair)); + } + + RPROF_START("Move chunk data to static arrays"); + uboid_t *static_chunks[chunks->width * chunks->height]; for (int y = 0; y < chunks->height; y++) { for (int x = 0; x < chunks->width; x++) { - BoidList *chunk = chunkgrid_get(chunks, x, y); + size_t chunk_idx = chunkgrid_get_idx(chunks, x, y); + BoidList *chunk = &chunks->data[chunk_idx]; + static_chunks[chunk_idx] = (uboid_t*)arena_malloc(arena, sizeof(uboid_t) * chunk->count); + boid_list_to_array(static_chunks[chunk_idx], chunk); + } + } + RPROF_STOP(); + + RPROF_START("Calc dot products and ranges (simd)"); + for (int y = 0; y < chunks->height; y++) { + for (int x = 0; x < chunks->width; x++) { + size_t chunk_idx = chunkgrid_get_idx(chunks, x, y); + BoidList *chunk = &chunks->data[chunk_idx]; if (chunk->count == 0) continue; - std::vector b2l_cmps; // TODO: remove usage of std::vec, it is kinda slow - b2l_cmps.reserve(64); - - uboid_t chunk_boids[chunk->count]; - Vector2 chunk_boids_pos[chunk->count + 8]; - Vector2 chunk_boids_dir[chunk->count + 8]; - memset(chunk_boids_pos, 0, sizeof(Vector2) * (chunk->count + 8)); - boid_list_to_array(chunk_boids, chunk); - for (int i = 0; i < chunk->count; i++) { - uboid_t boid = chunk_boids[i]; - chunk_boids_pos[i] = boid_positions[boid]; - chunk_boids_dir[i] = boid_dirs[boid]; - } - + uboid_t *chunk_boids = static_chunks[chunk_idx]; for (int i = 0; i < chunk->count-1; i++) { uboid_t from_boid = chunk_boids[i]; uboid_t *to_boids = &chunk_boids[i+1]; uboid_t to_boids_count = chunk->count-i-1; - Vector2 *to_chunk_boids_pos = &chunk_boids_pos[i+1]; - Vector2 *to_chunk_boids_dir = &chunk_boids_dir[i+1]; - b2l_cmp cmp = {}; - cmp.from = from_boid; - cmp.to_list = to_boids; - cmp.to_list_count = to_boids_count; - - int to_boids_count_8 = nearest_multiple(to_boids_count, 8); - cmp.to_list_pos_count = to_boids_count_8/8; - cmp.to_list_pos_x = (__m256*)arena_malloc(arena, sizeof(__m256) * cmp.to_list_pos_count, sizeof(__m256)); - cmp.to_list_pos_y = (__m256*)arena_malloc(arena, sizeof(__m256) * cmp.to_list_pos_count, sizeof(__m256)); - cmp.to_list_dir_x = (__m256*)arena_malloc(arena, sizeof(__m256) * cmp.to_list_pos_count, sizeof(__m256)); - cmp.to_list_dir_y = (__m256*)arena_malloc(arena, sizeof(__m256) * cmp.to_list_pos_count, sizeof(__m256)); - vector2_list_to_simd8(to_chunk_boids_pos, to_boids_count_8, cmp.to_list_pos_x, cmp.to_list_pos_y); - vector2_list_to_simd8(to_chunk_boids_dir, to_boids_count_8, cmp.to_list_dir_x, cmp.to_list_dir_y); - b2l_cmps.push_back(cmp); + for (int j = 0; j < to_boids_count; j++) { + // TODO: + // DEBUG_ASSERT(b2b_cmps_count < b2b_capacity-1); + b2b_cmps[b2b_cmps_count++] = { + .from = from_boid, + .to = to_boids[j] + }; + } } Vector2 neighbours[] = { { 1, 0 }, { 0, 1 }, { 1, 1 }, { -1, 1 } }; - for (int i = 0; i < ARRAY_LEN(neighbours); i++) { - int chunk_y = y + neighbours[i].y; - int chunk_x = x + neighbours[i].x; + for (int neighbour_i = 0; neighbour_i < ARRAY_LEN(neighbours); neighbour_i++) { + int chunk_y = y + neighbours[neighbour_i].y; if (chunk_y < 0 || chunk_y >= chunks->height) continue; + int chunk_x = x + neighbours[neighbour_i].x; if (chunk_x < 0 || chunk_x >= chunks->width) continue; - BoidList *neighbour_chunk = chunkgrid_get(chunks, chunk_x, chunk_y); + size_t neighbour_idx = chunkgrid_get_idx(chunks, chunk_x, chunk_y); + BoidList *neighbour_chunk = &chunks->data[neighbour_idx]; if (neighbour_chunk->count == 0) continue; - // TODO: alloc 'neighbour_ids' into scratch arena - uboid_t *neighbour_ids = (uboid_t*)arena_malloc(arena, sizeof(uboid_t)*neighbour_chunk->count); - boid_list_to_array(neighbour_ids, neighbour_chunk); - - Vector2 neighbour_boids_pos[neighbour_chunk->count + 8]; - Vector2 neighbour_boids_dir[neighbour_chunk->count + 8]; - memset(neighbour_boids_pos, 0, sizeof(Vector2) * (neighbour_chunk->count + 8)); - for (int i = 0; i < neighbour_chunk->count; i++) { - neighbour_boids_pos[i] = boid_positions[neighbour_ids[i]]; - neighbour_boids_dir[i] = boid_dirs[neighbour_ids[i]]; - } - int to_boids_count_8 = nearest_multiple(neighbour_chunk->count, 8); - __m256 *to_list_pos_x = (__m256*)arena_malloc(arena, sizeof(__m256) * to_boids_count_8/8, sizeof(__m256)); - __m256 *to_list_pos_y = (__m256*)arena_malloc(arena, sizeof(__m256) * to_boids_count_8/8, sizeof(__m256)); - __m256 *to_list_dir_x = (__m256*)arena_malloc(arena, sizeof(__m256) * to_boids_count_8/8, sizeof(__m256)); - __m256 *to_list_dir_y = (__m256*)arena_malloc(arena, sizeof(__m256) * to_boids_count_8/8, sizeof(__m256)); - vector2_list_to_simd8(neighbour_boids_pos, to_boids_count_8, to_list_pos_x, to_list_pos_y); - vector2_list_to_simd8(neighbour_boids_dir, to_boids_count_8, to_list_dir_x, to_list_dir_y); - - uboid_t boid1; - BoidsListNodeIterator it1 = boid_list_get_iterator(chunk); - while (boid_list_iterator_next(&it1, &boid1)) { - b2l_cmp cmp = {}; - cmp.from = boid1; - cmp.to_list = neighbour_ids; - cmp.to_list_count = neighbour_chunk->count; - cmp.to_list_pos_x = to_list_pos_x; - cmp.to_list_pos_y = to_list_pos_y; - cmp.to_list_dir_x = to_list_dir_x; - cmp.to_list_dir_y = to_list_dir_y; - cmp.to_list_pos_count = to_boids_count_8/8; - b2l_cmps.push_back(cmp); - } - } - - for (int i = 0; i < b2l_cmps.size(); i++) { - b2l_cmp *cmp = &b2l_cmps[i]; - uboid_t from_boid = cmp->from; - - Vector2 from_pos = boid_positions[from_boid]; - Vector2 from_dir = boid_dirs[from_boid]; - - float view_radius_sqr = world->view_radius * world->view_radius; - - __m256 view_radius = _mm256_set1_ps(view_radius_sqr); - __m256 from_pos_x = _mm256_set1_ps(from_pos.x); - __m256 from_pos_y = _mm256_set1_ps(from_pos.y); - __m256 from_dir_x = _mm256_set1_ps(from_dir.x); - __m256 from_dir_y = _mm256_set1_ps(from_dir.y); - __m256 zero = _mm256_set1_ps(0); - __m256 negative_one = _mm256_set1_ps(-1); - __m256i to_list_count = _mm256_set1_epi32(cmp->to_list_count); - - for (int j = 0; j < cmp->to_list_pos_count; j++) { - __m256 to_pos_x = cmp->to_list_pos_x[j]; - __m256 to_pos_y = cmp->to_list_pos_y[j]; - __m256 to_dir_x = cmp->to_list_dir_x[j]; - __m256 to_dir_y = cmp->to_list_dir_y[j]; - - __m256 sub_x = _mm256_sub_ps(from_pos_x, to_pos_x); - __m256 sub_y = _mm256_sub_ps(from_pos_y, cmp->to_list_pos_y[j]); - - __m256 x_sqr = _mm256_mul_ps(sub_x, sub_x); - __m256 length_sqr = _mm256_fmadd_ps(sub_y, sub_y, x_sqr); - __m256i in_range_mask = (__m256i)_mm256_cmp_ps(length_sqr, view_radius, _CMP_LE_OQ); - - __m256 is_length_zero = _mm256_cmp_ps(length_sqr, zero, _CMP_EQ_OQ); - __m256 ilength = _mm256_blendv_ps(_mm256_rsqrt_ps(length_sqr), zero, is_length_zero); - - __m256 x_norm = _mm256_mul_ps(sub_x, ilength); - __m256 y_norm = _mm256_mul_ps(sub_y, ilength); - - __m256 x_neg_norm = _mm256_mul_ps(x_norm, negative_one); - __m256 y_neg_norm = _mm256_mul_ps(y_norm, negative_one); - - __m256 dot_product1 = _mm256_fmadd_ps(from_dir_y, y_neg_norm, _mm256_mul_ps(from_dir_x, x_neg_norm)); - __m256 in_angle_mask1 = _mm256_cmp_ps(dot_product1, dot_threshold, _CMP_GE_OQ); - __m256 do_append_mask1 = _mm256_and_ps(in_angle_mask1, (__m256)in_range_mask); - - __m256 dot_product2 = _mm256_fmadd_ps(to_dir_y, y_norm, _mm256_mul_ps(to_dir_x, x_norm)); - __m256 in_angle_mask2 = _mm256_cmp_ps(dot_product2, dot_threshold, _CMP_GE_OQ); - __m256 do_append_mask2 = _mm256_and_ps(in_angle_mask2, (__m256)in_range_mask); - - _mm256_store_ps((float*)do_append_mask1_f32, do_append_mask1); - _mm256_store_ps((float*)do_append_mask2_f32, do_append_mask2); - for (int k = 0; k < 8; k++) { - uboid_t to_boid_idx = 8*j + k; - if (to_boid_idx >= cmp->to_list_count) break; - - uboid_t to_boid = cmp->to_list[to_boid_idx]; - if (do_append_mask1_f32[k]) { - boid_list_append(&world->frame_arena, &local_boids[from_boid], to_boid); - g_prof_interactions++; - } - if (do_append_mask2_f32[k]) { - boid_list_append(&world->frame_arena, &local_boids[to_boid], from_boid); - g_prof_interactions++; - } + uboid_t *neighbour_boids = static_chunks[neighbour_idx]; + for (int i = 0; i < chunk->count; i++) { + for (int j = 0; j < neighbour_chunk->count; j++) { + // TODO: + // DEBUG_ASSERT(b2b_cmps_count < b2b_capacity-1); + b2b_cmps[b2b_cmps_count++] = { + .from = chunk_boids[i], + .to = neighbour_boids[j] + }; } } } + + if (b2b_cmps_count > 2048*3) { + world_process_local_boid_pairs(world, local_boids, b2b_cmps, &b2b_cmps_count); + } } } + + if (b2b_cmps_count > 0) { + world_process_local_boid_pairs(world, local_boids, b2b_cmps, &b2b_cmps_count); + } + RPROF_STOP(); } @@ -470,8 +531,7 @@ static BoidList* world_compute_local_boids(World *world) { } RPROF_STOP(); - size_t alloc_chunks = world->frame_arena.offset; - float chunk_size = std::max(world->view_radius, 15.0f); + float chunk_size = std::max(world->view_radius, 1.0f); int chunks_wide = std::ceil(world->size.x / chunk_size) + 1; int chunks_high = std::ceil(world->size.y / chunk_size) + 1; @@ -483,25 +543,17 @@ static BoidList* world_compute_local_boids(World *world) { RPROF_START("Assign boids to chunks"); for (int i = 0; i < boid_count; i++) { Boid *boid = &boids[i]; - int chunk_x = boid->pos.x / chunk_size; - int chunk_y = boid->pos.y / chunk_size; + int x = boid->pos.x / chunk_size; + int y = boid->pos.y / chunk_size; - boid_list_append(arena, chunkgrid_get(&chunks, chunk_x, chunk_y), i); - } - RPROF_STOP(); - - RPROF_START("Extracting boid positions"); - Vector2 *boid_dirs = (Vector2*)arena_malloc(arena, sizeof(Vector2)*boid_count); - Vector2 *boid_positions = (Vector2*)arena_malloc(arena, sizeof(Vector2)*boid_count); - for (int i = 0; i < boid_count; i++) { - boid_positions[i] = boids[i].pos; - boid_dirs[i] = boids[i].dir; + boid_list_append(arena, chunkgrid_get(&chunks, x, y), i); } RPROF_STOP(); RPROF_START("world_compute_local_boids()"); // TODO: Use scalar version for WASM or make 128bit version. world_compute_local_boids_simd(all_local_boids, world, &chunks); + RPROF_STOP(); return all_local_boids; @@ -519,6 +571,10 @@ static void world_update(World *world, float dt) { BoidList *list_of_local_boids = world_compute_local_boids(world); RPROF_START("Apply forces"); + int avoidance_ray_count = world->collision_avoidance_ray_count * 2 + 1; + Vector2 avoidance_ray_dirs[avoidance_ray_count]; + fill_avoidance_ray_dirs(avoidance_ray_dirs, avoidance_ray_count, world->collision_avoidance_ray_angle); + for (int i = 0; i < boid_count; i++) { Boid *boid = &world->boids[i]; Vector2 acc = { 0, 0 }; @@ -556,7 +612,7 @@ static void world_update(World *world, float dt) { } // Apply obstacle avoidance to accelaration - Vector2 collision_avoidance = get_collision_avoidance_dir(world, boid); + Vector2 collision_avoidance = get_collision_avoidance_dir(world, boid, avoidance_ray_dirs, avoidance_ray_count); acc = Vector2Add(acc, vector2_mul_value(collision_avoidance, world->collision_avoidance_strength)); acc = vector2_mul_value(acc, world->max_speed);