From 50c860ddf9a9f8d9dea2877f1bc4f33f94e73a96 Mon Sep 17 00:00:00 2001
From: Rokas Puzonas <rokas.puz@gmail.com>
Date: Tue, 1 Aug 2023 18:06:55 +0300
Subject: [PATCH] improve performance

---
 src/boid-playground.hpp |   2 +-
 src/main.cpp            |   2 +-
 src/world.cpp           | 440 ++++++++++++++++++++++------------------
 3 files changed, 250 insertions(+), 194 deletions(-)

diff --git a/src/boid-playground.hpp b/src/boid-playground.hpp
index 59a05b1..18cea93 100644
--- a/src/boid-playground.hpp
+++ b/src/boid-playground.hpp
@@ -12,7 +12,7 @@
 #define DEBUG_ASSERT(...) assert(__VA_ARGS__)
 
 typedef uint16_t uboid_t;
-#define MAX_BOIDS 1 << (sizeof(uboid_t)*8)
+#define MAX_BOIDS (1 << (sizeof(uboid_t)*8))
 
 struct Boid {
 	Vector2 pos;
diff --git a/src/main.cpp b/src/main.cpp
index 24b3ee8..c825374 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -54,7 +54,7 @@ int main() {
 	world_init(&g_world, screen_width, screen_height);
 
 	float border = g_world.collision_avoidance_distance;
-	for (int i = 0; i < 33000; i++) {
+	for (int i = 0; i < 45000; i++) {
 		Boid boid;
 		boid_rand_init(&g_world, &boid, border);
 		g_world.boids.push_back(boid);
diff --git a/src/world.cpp b/src/world.cpp
index 1662f27..dd125c0 100644
--- a/src/world.cpp
+++ b/src/world.cpp
@@ -46,19 +46,27 @@ static void fill_avoidance_ray_angles(float *rays, int ray_count, float ray_angl
 	}
 }
 
-static Vector2 get_collision_avoidance_dir(World *world, Boid *boid) {
-	int ray_count = world->collision_avoidance_ray_count * 2 + 1;
+static void fill_avoidance_ray_dirs(Vector2 *rays, int ray_count, float ray_angle) {
 	float ray_angles[ray_count];
-	fill_avoidance_ray_angles(ray_angles, ray_count, world->collision_avoidance_ray_angle);
-
-	int best_avoidance = -1;
-	Vector2 avoidance_dir = { 0, 0 };
-	float facing = std::atan2(boid->dir.y, boid->dir.x);
-	bool got_hit = false;
-	RayHitResult hit_results[ray_count];
+	fill_avoidance_ray_angles(ray_angles, ray_count, ray_angle);
 
 	for (int i = 0; i < ray_count; i++) {
-		Vector2 ray_dir = vector2_from_angle(facing + ray_angles[i]);
+		rays[i].x = cos(ray_angles[i]);
+		rays[i].y = sin(ray_angles[i]);
+	}
+}
+
+static Vector2 get_collision_avoidance_dir(World *world, Boid *boid, Vector2 *directions, int direction_count) {
+	int best_avoidance = -1;
+	Vector2 avoidance_dir = { 0, 0 };
+	bool got_hit = false;
+	RayHitResult hit_results[direction_count];
+
+	for (int i = 0; i < direction_count; i++) {
+		Vector2 ray_dir;
+		ray_dir.x = boid->dir.x * directions[i].x - boid->dir.y * directions[i].y;
+		ray_dir.y = boid->dir.x * directions[i].y + boid->dir.y * directions[i].x;
+
 		get_intersect_with_world(&hit_results[i], boid->pos, ray_dir, world);
 		if (hit_results[i].hit != -1 && hit_results[i].hit <= world->collision_avoidance_distance) {
 			got_hit = true;
@@ -131,8 +139,12 @@ struct ChunkGrid {
 	int height;
 };
 
+static size_t chunkgrid_get_idx(ChunkGrid *grid, int x, int y) {
+	return y * grid->width + x;
+}
+
 static BoidList *chunkgrid_get(ChunkGrid *grid, int x, int y) {
-	return &grid->data[y * grid->width + x];
+	return &grid->data[chunkgrid_get_idx(grid, x, y)];
 }
 
 static void chunkgrid_init(MemoryArena *arena, ChunkGrid *grid, int width, int height) {
@@ -217,37 +229,19 @@ static void vector2_list_to_simd8(Vector2 *vecs, int vec_count, __m256 *vecs_x,
 	}
 }
 
-// BUG: functions `world_compute_local_boids_simd` and `world_compute_local_boids_scalar` don't give the
-// same result. Investiagte further with profiling. Something related with iterating neighbour chunks.
-// Difference isin't huge, but exists even on -O0.
-// Also don't know which one is the more correct one.
+static inline bool mm256_is_zero(__m256i x) {
+	return _mm256_testz_si256(x, x);
+}
+static inline bool mm256_is_zero(__m256 x) {
+	return mm256_is_zero((__m256i)x);
+}
 
-static void world_compute_local_boids_simd(BoidList *local_boids, World *world, ChunkGrid *chunks) {
-	struct b2l_cmp {
-		uboid_t from;
-		uboid_t *to_list;
-		uboid_t to_list_count;
-
-		__m256 *to_list_pos_x;
-		__m256 *to_list_pos_y;
-		__m256 *to_list_dir_x;
-		__m256 *to_list_dir_y;
-		int to_list_pos_count;
-	};
-
-	Boid *boids = world->boids.data();
-	int boid_count = world->boids.size();
-	MemoryArena *arena = &world->frame_arena;
-
-	RPROF_START("Extracting boid positions");
-	Vector2 *boid_dirs = (Vector2*)arena_malloc(arena, sizeof(Vector2)*boid_count);
-	Vector2 *boid_positions = (Vector2*)arena_malloc(arena, sizeof(Vector2)*boid_count);
-	for (int i = 0; i < boid_count; i++) {
-		boid_positions[i] = boids[i].pos;
-		boid_dirs[i] = boids[i].dir;
-	}
-	RPROF_STOP();
+struct boid_pair {
+	uboid_t from;
+	uboid_t to;
+};
 
+static void world_process_local_boid_pairs(World *world, BoidList *local_boids, boid_pair *b2b_cmps, int *b2b_cmps_count) {
 	// Simplified from:
 	//   float dot_threshold = Vector2DotProduct(dir, Vector2Rotate(dir, world->view_angle/2));
 	//                         |
@@ -255,166 +249,233 @@ static void world_compute_local_boids_simd(BoidList *local_boids, World *world,
 	//   float dot_threshold = cosf(world->view_angle/2);
 	__m256 dot_threshold = _mm256_set1_ps(cosf(world->view_angle/2));
 
-	RPROF_START("Calc dot products and ranges (simd)");
-	// TODO: Use temp memory arena inside this profile block
-	int32_t *do_append_mask1_f32 = (int32_t*)arena_malloc(arena, sizeof(int32_t)*8, 32);
-	int32_t *do_append_mask2_f32 = (int32_t*)arena_malloc(arena, sizeof(int32_t)*8, 32);
+	float view_radius_sqr = world->view_radius * world->view_radius;
+	__m256 view_radius = _mm256_set1_ps(view_radius_sqr);
+	__m256 zero = _mm256_set1_ps(0);
+	__m256 negative_one = _mm256_set1_ps(-1);
+	Boid *boids = world->boids.data();
 
+	int32_t *do_append_mask1_f32 = (int32_t*)arena_malloc(&world->frame_arena, sizeof(int32_t)*8, 32);
+	int32_t *do_append_mask2_f32 = (int32_t*)arena_malloc(&world->frame_arena, sizeof(int32_t)*8, 32);
+
+	int simd_iteration_count = nearest_multiple(*b2b_cmps_count, 8)/8;
+	for (int i = 0; i < simd_iteration_count; i++) {
+		__m256 from_pos_x = _mm256_set_ps(
+			boids[b2b_cmps[8*i+7].from].pos.x,
+			boids[b2b_cmps[8*i+6].from].pos.x,
+			boids[b2b_cmps[8*i+5].from].pos.x,
+			boids[b2b_cmps[8*i+4].from].pos.x,
+			boids[b2b_cmps[8*i+3].from].pos.x,
+			boids[b2b_cmps[8*i+2].from].pos.x,
+			boids[b2b_cmps[8*i+1].from].pos.x,
+			boids[b2b_cmps[8*i+0].from].pos.x
+		);
+
+		__m256 from_pos_y = _mm256_set_ps(
+			boids[b2b_cmps[8*i+7].from].pos.y,
+			boids[b2b_cmps[8*i+6].from].pos.y,
+			boids[b2b_cmps[8*i+5].from].pos.y,
+			boids[b2b_cmps[8*i+4].from].pos.y,
+			boids[b2b_cmps[8*i+3].from].pos.y,
+			boids[b2b_cmps[8*i+2].from].pos.y,
+			boids[b2b_cmps[8*i+1].from].pos.y,
+			boids[b2b_cmps[8*i+0].from].pos.y
+		);
+
+		__m256 to_pos_x = _mm256_set_ps(
+			boids[b2b_cmps[8*i+7].to].pos.x,
+			boids[b2b_cmps[8*i+6].to].pos.x,
+			boids[b2b_cmps[8*i+5].to].pos.x,
+			boids[b2b_cmps[8*i+4].to].pos.x,
+			boids[b2b_cmps[8*i+3].to].pos.x,
+			boids[b2b_cmps[8*i+2].to].pos.x,
+			boids[b2b_cmps[8*i+1].to].pos.x,
+			boids[b2b_cmps[8*i+0].to].pos.x
+		);
+
+		__m256 to_pos_y = _mm256_set_ps(
+			boids[b2b_cmps[8*i+7].to].pos.y,
+			boids[b2b_cmps[8*i+6].to].pos.y,
+			boids[b2b_cmps[8*i+5].to].pos.y,
+			boids[b2b_cmps[8*i+4].to].pos.y,
+			boids[b2b_cmps[8*i+3].to].pos.y,
+			boids[b2b_cmps[8*i+2].to].pos.y,
+			boids[b2b_cmps[8*i+1].to].pos.y,
+			boids[b2b_cmps[8*i+0].to].pos.y
+		);
+
+		__m256 sub_x = _mm256_sub_ps(from_pos_x, to_pos_x);
+		__m256 sub_y = _mm256_sub_ps(from_pos_y, to_pos_y);
+
+		__m256 length_sqr = _mm256_fmadd_ps(sub_y, sub_y, _mm256_mul_ps(sub_x, sub_x));
+		__m256i in_range_mask = (__m256i)_mm256_cmp_ps(length_sqr, view_radius, _CMP_LE_OQ);
+		if (mm256_is_zero(in_range_mask)) continue;
+
+		__m256 from_dir_x = _mm256_set_ps(
+			boids[b2b_cmps[8*i+7].from].dir.x,
+			boids[b2b_cmps[8*i+6].from].dir.x,
+			boids[b2b_cmps[8*i+5].from].dir.x,
+			boids[b2b_cmps[8*i+4].from].dir.x,
+			boids[b2b_cmps[8*i+3].from].dir.x,
+			boids[b2b_cmps[8*i+2].from].dir.x,
+			boids[b2b_cmps[8*i+1].from].dir.x,
+			boids[b2b_cmps[8*i+0].from].dir.x
+		);
+
+		__m256 from_dir_y = _mm256_set_ps(
+			boids[b2b_cmps[8*i+7].from].dir.y,
+			boids[b2b_cmps[8*i+6].from].dir.y,
+			boids[b2b_cmps[8*i+5].from].dir.y,
+			boids[b2b_cmps[8*i+4].from].dir.y,
+			boids[b2b_cmps[8*i+3].from].dir.y,
+			boids[b2b_cmps[8*i+2].from].dir.y,
+			boids[b2b_cmps[8*i+1].from].dir.y,
+			boids[b2b_cmps[8*i+0].from].dir.y
+		);
+
+		__m256 to_dir_x = _mm256_set_ps(
+			boids[b2b_cmps[8*i+7].to].dir.x,
+			boids[b2b_cmps[8*i+6].to].dir.x,
+			boids[b2b_cmps[8*i+5].to].dir.x,
+			boids[b2b_cmps[8*i+4].to].dir.x,
+			boids[b2b_cmps[8*i+3].to].dir.x,
+			boids[b2b_cmps[8*i+2].to].dir.x,
+			boids[b2b_cmps[8*i+1].to].dir.x,
+			boids[b2b_cmps[8*i+0].to].dir.x
+		);
+
+		__m256 to_dir_y = _mm256_set_ps(
+			boids[b2b_cmps[8*i+7].to].dir.y,
+			boids[b2b_cmps[8*i+6].to].dir.y,
+			boids[b2b_cmps[8*i+5].to].dir.y,
+			boids[b2b_cmps[8*i+4].to].dir.y,
+			boids[b2b_cmps[8*i+3].to].dir.y,
+			boids[b2b_cmps[8*i+2].to].dir.y,
+			boids[b2b_cmps[8*i+1].to].dir.y,
+			boids[b2b_cmps[8*i+0].to].dir.y
+		);
+
+		__m256 is_length_zero = (__m256)_mm256_cmpeq_epi32((__m256i)length_sqr, (__m256i)zero);
+		__m256 ilength = _mm256_blendv_ps(_mm256_rsqrt_ps(length_sqr), zero, is_length_zero);
+
+		__m256 x_norm = _mm256_mul_ps(sub_x, ilength);
+		__m256 y_norm = _mm256_mul_ps(sub_y, ilength);
+
+		__m256 x_neg_norm = _mm256_mul_ps(x_norm, negative_one);
+		__m256 y_neg_norm = _mm256_mul_ps(y_norm, negative_one);
+
+		__m256 dot_product1 = _mm256_fmadd_ps(from_dir_y, y_neg_norm, _mm256_mul_ps(from_dir_x, x_neg_norm));
+		__m256 in_angle_mask1 = _mm256_cmp_ps(dot_product1, dot_threshold, _CMP_GE_OQ);
+		__m256 do_append_mask1 = _mm256_and_ps(in_angle_mask1, (__m256)in_range_mask);
+
+		__m256 dot_product2 = _mm256_fmadd_ps(to_dir_y, y_norm, _mm256_mul_ps(to_dir_x, x_norm));
+		__m256 in_angle_mask2 = _mm256_cmp_ps(dot_product2, dot_threshold, _CMP_GE_OQ);
+		__m256 do_append_mask2 = _mm256_and_ps(in_angle_mask2, (__m256)in_range_mask);
+
+		_mm256_store_ps((float*)do_append_mask1_f32, do_append_mask1);
+		_mm256_store_ps((float*)do_append_mask2_f32, do_append_mask2);
+		for (int j = 0; j < 8; j++) {
+			uboid_t cmp_idx = 8*i + j;
+			if (cmp_idx >= *b2b_cmps_count) break;
+
+			uboid_t from_boid = b2b_cmps[cmp_idx].from;
+			uboid_t to_boid   = b2b_cmps[cmp_idx].to;
+			if (do_append_mask1_f32[j]) {
+				boid_list_append(&world->frame_arena, &local_boids[from_boid], to_boid);
+				g_prof_interactions++;
+			}
+			if (do_append_mask2_f32[j]) {
+				boid_list_append(&world->frame_arena, &local_boids[to_boid], from_boid);
+				g_prof_interactions++;
+			}
+		}
+	}
+
+	*b2b_cmps_count = 0;
+}
+
+static void world_compute_local_boids_simd(BoidList *local_boids, World *world, ChunkGrid *chunks) {
+	Boid *boids = world->boids.data();
+	int boid_count = world->boids.size();
+	MemoryArena *arena = &world->frame_arena;
+
+	int b2b_padding = 8;
+	int b2b_capacity = 2048*10;
+	boid_pair b2b_cmps[b2b_capacity + b2b_padding];
+	int b2b_cmps_count = 0;
+	for (int i = 0; i < b2b_padding; i++) {
+		memset(&b2b_cmps[b2b_capacity + i], 0, sizeof(boid_pair));
+	}
+
+	RPROF_START("Move chunk data to static arrays");
+	uboid_t *static_chunks[chunks->width * chunks->height];
 	for (int y = 0; y < chunks->height; y++) {
 		for (int x = 0; x < chunks->width; x++) {
-			BoidList *chunk = chunkgrid_get(chunks, x, y);
+			size_t chunk_idx = chunkgrid_get_idx(chunks, x, y);
+			BoidList *chunk = &chunks->data[chunk_idx];
+			static_chunks[chunk_idx] = (uboid_t*)arena_malloc(arena, sizeof(uboid_t) * chunk->count);
+			boid_list_to_array(static_chunks[chunk_idx], chunk);
+		}
+	}
+	RPROF_STOP();
+
+	RPROF_START("Calc dot products and ranges (simd)");
+	for (int y = 0; y < chunks->height; y++) {
+		for (int x = 0; x < chunks->width; x++) {
+			size_t chunk_idx = chunkgrid_get_idx(chunks, x, y);
+			BoidList *chunk = &chunks->data[chunk_idx];
 			if (chunk->count == 0) continue;
 
-			std::vector<b2l_cmp> b2l_cmps; // TODO: remove usage of std::vec<T>, it is kinda slow
-			b2l_cmps.reserve(64);
-
-			uboid_t chunk_boids[chunk->count];
-			Vector2 chunk_boids_pos[chunk->count + 8];
-			Vector2 chunk_boids_dir[chunk->count + 8];
-			memset(chunk_boids_pos, 0, sizeof(Vector2) * (chunk->count + 8));
-			boid_list_to_array(chunk_boids, chunk);
-			for (int i = 0; i < chunk->count; i++) {
-				uboid_t boid = chunk_boids[i];
-				chunk_boids_pos[i] = boid_positions[boid];
-				chunk_boids_dir[i] = boid_dirs[boid];
-			}
-
+			uboid_t *chunk_boids = static_chunks[chunk_idx];
 			for (int i = 0; i < chunk->count-1; i++) {
 				uboid_t from_boid = chunk_boids[i];
 				uboid_t *to_boids = &chunk_boids[i+1];
 				uboid_t to_boids_count = chunk->count-i-1;
-				Vector2 *to_chunk_boids_pos = &chunk_boids_pos[i+1];
-				Vector2 *to_chunk_boids_dir = &chunk_boids_dir[i+1];
 
-				b2l_cmp cmp = {};
-				cmp.from = from_boid;
-				cmp.to_list = to_boids;
-				cmp.to_list_count = to_boids_count;
-
-				int to_boids_count_8 = nearest_multiple(to_boids_count, 8);
-				cmp.to_list_pos_count = to_boids_count_8/8;
-				cmp.to_list_pos_x = (__m256*)arena_malloc(arena, sizeof(__m256) * cmp.to_list_pos_count, sizeof(__m256));
-				cmp.to_list_pos_y = (__m256*)arena_malloc(arena, sizeof(__m256) * cmp.to_list_pos_count, sizeof(__m256));
-				cmp.to_list_dir_x = (__m256*)arena_malloc(arena, sizeof(__m256) * cmp.to_list_pos_count, sizeof(__m256));
-				cmp.to_list_dir_y = (__m256*)arena_malloc(arena, sizeof(__m256) * cmp.to_list_pos_count, sizeof(__m256));
-				vector2_list_to_simd8(to_chunk_boids_pos, to_boids_count_8, cmp.to_list_pos_x, cmp.to_list_pos_y);
-				vector2_list_to_simd8(to_chunk_boids_dir, to_boids_count_8, cmp.to_list_dir_x, cmp.to_list_dir_y);
-				b2l_cmps.push_back(cmp);
+				for (int j = 0; j < to_boids_count; j++) {
+					// TODO:
+					// DEBUG_ASSERT(b2b_cmps_count < b2b_capacity-1);
+					b2b_cmps[b2b_cmps_count++] = {
+						.from = from_boid,
+						.to = to_boids[j]
+					};
+				}
 			}
 
 			Vector2 neighbours[] = { { 1, 0 }, { 0, 1 }, { 1, 1 }, { -1, 1 } };
-			for (int i = 0; i < ARRAY_LEN(neighbours); i++) {
-				int chunk_y = y + neighbours[i].y;
-				int chunk_x = x + neighbours[i].x;
+			for (int neighbour_i = 0; neighbour_i < ARRAY_LEN(neighbours); neighbour_i++) {
+				int chunk_y = y + neighbours[neighbour_i].y;
 				if (chunk_y < 0 || chunk_y >= chunks->height) continue;
+				int chunk_x = x + neighbours[neighbour_i].x;
 				if (chunk_x < 0 || chunk_x >= chunks->width) continue;
 
-				BoidList *neighbour_chunk = chunkgrid_get(chunks, chunk_x, chunk_y);
+				size_t neighbour_idx = chunkgrid_get_idx(chunks, chunk_x, chunk_y);
+				BoidList *neighbour_chunk = &chunks->data[neighbour_idx];
 				if (neighbour_chunk->count == 0) continue;
 
-				// TODO: alloc 'neighbour_ids' into scratch arena
-				uboid_t *neighbour_ids = (uboid_t*)arena_malloc(arena, sizeof(uboid_t)*neighbour_chunk->count);
-				boid_list_to_array(neighbour_ids, neighbour_chunk);
-
-				Vector2 neighbour_boids_pos[neighbour_chunk->count + 8];
-				Vector2 neighbour_boids_dir[neighbour_chunk->count + 8];
-				memset(neighbour_boids_pos, 0, sizeof(Vector2) * (neighbour_chunk->count + 8));
-				for (int i = 0; i < neighbour_chunk->count; i++) {
-					neighbour_boids_pos[i] = boid_positions[neighbour_ids[i]];
-					neighbour_boids_dir[i] = boid_dirs[neighbour_ids[i]];
-				}
-				int to_boids_count_8 = nearest_multiple(neighbour_chunk->count, 8);
-				__m256 *to_list_pos_x = (__m256*)arena_malloc(arena, sizeof(__m256) * to_boids_count_8/8, sizeof(__m256));
-				__m256 *to_list_pos_y = (__m256*)arena_malloc(arena, sizeof(__m256) * to_boids_count_8/8, sizeof(__m256));
-				__m256 *to_list_dir_x = (__m256*)arena_malloc(arena, sizeof(__m256) * to_boids_count_8/8, sizeof(__m256));
-				__m256 *to_list_dir_y = (__m256*)arena_malloc(arena, sizeof(__m256) * to_boids_count_8/8, sizeof(__m256));
-				vector2_list_to_simd8(neighbour_boids_pos, to_boids_count_8, to_list_pos_x, to_list_pos_y);
-				vector2_list_to_simd8(neighbour_boids_dir, to_boids_count_8, to_list_dir_x, to_list_dir_y);
-
-				uboid_t boid1;
-				BoidsListNodeIterator it1 = boid_list_get_iterator(chunk);
-				while (boid_list_iterator_next(&it1, &boid1)) {
-					b2l_cmp cmp = {};
-					cmp.from = boid1;
-					cmp.to_list = neighbour_ids;
-					cmp.to_list_count = neighbour_chunk->count;
-					cmp.to_list_pos_x = to_list_pos_x;
-					cmp.to_list_pos_y = to_list_pos_y;
-					cmp.to_list_dir_x = to_list_dir_x;
-					cmp.to_list_dir_y = to_list_dir_y;
-					cmp.to_list_pos_count = to_boids_count_8/8;
-					b2l_cmps.push_back(cmp);
-				}
-			}
-
-			for (int i = 0; i < b2l_cmps.size(); i++) {
-				b2l_cmp *cmp = &b2l_cmps[i];
-				uboid_t from_boid = cmp->from;
-
-				Vector2 from_pos = boid_positions[from_boid];
-				Vector2 from_dir = boid_dirs[from_boid];
-
-				float view_radius_sqr = world->view_radius * world->view_radius;
-
-				__m256 view_radius = _mm256_set1_ps(view_radius_sqr);
-				__m256 from_pos_x  = _mm256_set1_ps(from_pos.x);
-				__m256 from_pos_y  = _mm256_set1_ps(from_pos.y);
-				__m256 from_dir_x  = _mm256_set1_ps(from_dir.x);
-				__m256 from_dir_y  = _mm256_set1_ps(from_dir.y);
-				__m256 zero = _mm256_set1_ps(0);
-				__m256 negative_one = _mm256_set1_ps(-1);
-				__m256i to_list_count = _mm256_set1_epi32(cmp->to_list_count);
-
-				for (int j = 0; j < cmp->to_list_pos_count; j++) {
-					__m256 to_pos_x = cmp->to_list_pos_x[j];
-					__m256 to_pos_y = cmp->to_list_pos_y[j];
-					__m256 to_dir_x = cmp->to_list_dir_x[j];
-					__m256 to_dir_y = cmp->to_list_dir_y[j];
-
-					__m256 sub_x = _mm256_sub_ps(from_pos_x, to_pos_x);
-					__m256 sub_y = _mm256_sub_ps(from_pos_y, cmp->to_list_pos_y[j]);
-
-					__m256 x_sqr = _mm256_mul_ps(sub_x, sub_x);
-					__m256 length_sqr = _mm256_fmadd_ps(sub_y, sub_y, x_sqr);
-					__m256i in_range_mask = (__m256i)_mm256_cmp_ps(length_sqr, view_radius, _CMP_LE_OQ);
-
-					__m256 is_length_zero = _mm256_cmp_ps(length_sqr, zero, _CMP_EQ_OQ);
-					__m256 ilength = _mm256_blendv_ps(_mm256_rsqrt_ps(length_sqr), zero, is_length_zero);
-
-					__m256 x_norm = _mm256_mul_ps(sub_x, ilength);
-					__m256 y_norm = _mm256_mul_ps(sub_y, ilength);
-
-					__m256 x_neg_norm = _mm256_mul_ps(x_norm, negative_one);
-					__m256 y_neg_norm = _mm256_mul_ps(y_norm, negative_one);
-
-					__m256 dot_product1 = _mm256_fmadd_ps(from_dir_y, y_neg_norm, _mm256_mul_ps(from_dir_x, x_neg_norm));
-					__m256 in_angle_mask1 = _mm256_cmp_ps(dot_product1, dot_threshold, _CMP_GE_OQ);
-					__m256 do_append_mask1 = _mm256_and_ps(in_angle_mask1, (__m256)in_range_mask);
-
-					__m256 dot_product2 = _mm256_fmadd_ps(to_dir_y, y_norm, _mm256_mul_ps(to_dir_x, x_norm));
-					__m256 in_angle_mask2 = _mm256_cmp_ps(dot_product2, dot_threshold, _CMP_GE_OQ);
-					__m256 do_append_mask2 = _mm256_and_ps(in_angle_mask2, (__m256)in_range_mask);
-
-					_mm256_store_ps((float*)do_append_mask1_f32, do_append_mask1);
-					_mm256_store_ps((float*)do_append_mask2_f32, do_append_mask2);
-					for (int k = 0; k < 8; k++) {
-						uboid_t to_boid_idx = 8*j + k;
-						if (to_boid_idx >= cmp->to_list_count) break;
-
-						uboid_t to_boid = cmp->to_list[to_boid_idx];
-						if (do_append_mask1_f32[k]) {
-							boid_list_append(&world->frame_arena, &local_boids[from_boid], to_boid);
-							g_prof_interactions++;
-						}
-						if (do_append_mask2_f32[k]) {
-							boid_list_append(&world->frame_arena, &local_boids[to_boid], from_boid);
-							g_prof_interactions++;
-						}
+				uboid_t *neighbour_boids = static_chunks[neighbour_idx];
+				for (int i = 0; i < chunk->count; i++) {
+					for (int j = 0; j < neighbour_chunk->count; j++) {
+						// TODO:
+						// DEBUG_ASSERT(b2b_cmps_count < b2b_capacity-1);
+						b2b_cmps[b2b_cmps_count++] = {
+							.from = chunk_boids[i],
+							.to = neighbour_boids[j]
+						};
 					}
 				}
 			}
+
+			if (b2b_cmps_count > 2048*3) {
+				world_process_local_boid_pairs(world, local_boids, b2b_cmps, &b2b_cmps_count);
+			}
 		}
 	}
+
+	if (b2b_cmps_count > 0) {
+		world_process_local_boid_pairs(world, local_boids, b2b_cmps, &b2b_cmps_count);
+	}
+
 	RPROF_STOP();
 }
 
@@ -470,8 +531,7 @@ static BoidList* world_compute_local_boids(World *world) {
 	}
 	RPROF_STOP();
 
-	size_t alloc_chunks = world->frame_arena.offset;
-	float chunk_size = std::max(world->view_radius, 15.0f);
+	float chunk_size = std::max(world->view_radius, 1.0f);
 	int chunks_wide = std::ceil(world->size.x / chunk_size) + 1;
 	int chunks_high = std::ceil(world->size.y / chunk_size) + 1;
 
@@ -483,25 +543,17 @@ static BoidList* world_compute_local_boids(World *world) {
 	RPROF_START("Assign boids to chunks");
 	for (int i = 0; i < boid_count; i++) {
 		Boid *boid = &boids[i];
-		int chunk_x = boid->pos.x / chunk_size;
-		int chunk_y = boid->pos.y / chunk_size;
+		int x = boid->pos.x / chunk_size;
+		int y = boid->pos.y / chunk_size;
 
-		boid_list_append(arena, chunkgrid_get(&chunks, chunk_x, chunk_y), i);
-	}
-	RPROF_STOP();
-
-	RPROF_START("Extracting boid positions");
-	Vector2 *boid_dirs = (Vector2*)arena_malloc(arena, sizeof(Vector2)*boid_count);
-	Vector2 *boid_positions = (Vector2*)arena_malloc(arena, sizeof(Vector2)*boid_count);
-	for (int i = 0; i < boid_count; i++) {
-		boid_positions[i] = boids[i].pos;
-		boid_dirs[i] = boids[i].dir;
+		boid_list_append(arena, chunkgrid_get(&chunks, x, y), i);
 	}
 	RPROF_STOP();
 
 	RPROF_START("world_compute_local_boids()");
 	// TODO: Use scalar version for WASM or make 128bit version.
 	world_compute_local_boids_simd(all_local_boids, world, &chunks);
+
 	RPROF_STOP();
 
 	return all_local_boids;
@@ -519,6 +571,10 @@ static void world_update(World *world, float dt) {
 	BoidList *list_of_local_boids = world_compute_local_boids(world);
 
 	RPROF_START("Apply forces");
+	int avoidance_ray_count = world->collision_avoidance_ray_count * 2 + 1;
+	Vector2 avoidance_ray_dirs[avoidance_ray_count];
+	fill_avoidance_ray_dirs(avoidance_ray_dirs, avoidance_ray_count, world->collision_avoidance_ray_angle);
+
 	for (int i = 0; i < boid_count; i++) {
 		Boid *boid = &world->boids[i];
 		Vector2 acc = { 0, 0 };
@@ -556,7 +612,7 @@ static void world_update(World *world, float dt) {
 		}
 
 		// Apply obstacle avoidance to accelaration
-		Vector2 collision_avoidance = get_collision_avoidance_dir(world, boid);
+		Vector2 collision_avoidance = get_collision_avoidance_dir(world, boid, avoidance_ray_dirs, avoidance_ray_count);
 		acc = Vector2Add(acc, vector2_mul_value(collision_avoidance, world->collision_avoidance_strength));
 
 		acc = vector2_mul_value(acc, world->max_speed);