improve performance

This commit is contained in:
Rokas Puzonas 2023-08-01 18:06:55 +03:00
parent 995d41c190
commit 50c860ddf9
3 changed files with 250 additions and 194 deletions

View File

@ -12,7 +12,7 @@
#define DEBUG_ASSERT(...) assert(__VA_ARGS__)
typedef uint16_t uboid_t;
#define MAX_BOIDS 1 << (sizeof(uboid_t)*8)
#define MAX_BOIDS (1 << (sizeof(uboid_t)*8))
struct Boid {
Vector2 pos;

View File

@ -54,7 +54,7 @@ int main() {
world_init(&g_world, screen_width, screen_height);
float border = g_world.collision_avoidance_distance;
for (int i = 0; i < 33000; i++) {
for (int i = 0; i < 45000; i++) {
Boid boid;
boid_rand_init(&g_world, &boid, border);
g_world.boids.push_back(boid);

View File

@ -46,19 +46,27 @@ static void fill_avoidance_ray_angles(float *rays, int ray_count, float ray_angl
}
}
static Vector2 get_collision_avoidance_dir(World *world, Boid *boid) {
int ray_count = world->collision_avoidance_ray_count * 2 + 1;
static void fill_avoidance_ray_dirs(Vector2 *rays, int ray_count, float ray_angle) {
float ray_angles[ray_count];
fill_avoidance_ray_angles(ray_angles, ray_count, world->collision_avoidance_ray_angle);
int best_avoidance = -1;
Vector2 avoidance_dir = { 0, 0 };
float facing = std::atan2(boid->dir.y, boid->dir.x);
bool got_hit = false;
RayHitResult hit_results[ray_count];
fill_avoidance_ray_angles(ray_angles, ray_count, ray_angle);
for (int i = 0; i < ray_count; i++) {
Vector2 ray_dir = vector2_from_angle(facing + ray_angles[i]);
rays[i].x = cos(ray_angles[i]);
rays[i].y = sin(ray_angles[i]);
}
}
static Vector2 get_collision_avoidance_dir(World *world, Boid *boid, Vector2 *directions, int direction_count) {
int best_avoidance = -1;
Vector2 avoidance_dir = { 0, 0 };
bool got_hit = false;
RayHitResult hit_results[direction_count];
for (int i = 0; i < direction_count; i++) {
Vector2 ray_dir;
ray_dir.x = boid->dir.x * directions[i].x - boid->dir.y * directions[i].y;
ray_dir.y = boid->dir.x * directions[i].y + boid->dir.y * directions[i].x;
get_intersect_with_world(&hit_results[i], boid->pos, ray_dir, world);
if (hit_results[i].hit != -1 && hit_results[i].hit <= world->collision_avoidance_distance) {
got_hit = true;
@ -131,8 +139,12 @@ struct ChunkGrid {
int height;
};
static size_t chunkgrid_get_idx(ChunkGrid *grid, int x, int y) {
return y * grid->width + x;
}
static BoidList *chunkgrid_get(ChunkGrid *grid, int x, int y) {
return &grid->data[y * grid->width + x];
return &grid->data[chunkgrid_get_idx(grid, x, y)];
}
static void chunkgrid_init(MemoryArena *arena, ChunkGrid *grid, int width, int height) {
@ -217,37 +229,19 @@ static void vector2_list_to_simd8(Vector2 *vecs, int vec_count, __m256 *vecs_x,
}
}
// BUG: functions `world_compute_local_boids_simd` and `world_compute_local_boids_scalar` don't give the
// same result. Investiagte further with profiling. Something related with iterating neighbour chunks.
// Difference isin't huge, but exists even on -O0.
// Also don't know which one is the more correct one.
static inline bool mm256_is_zero(__m256i x) {
return _mm256_testz_si256(x, x);
}
static inline bool mm256_is_zero(__m256 x) {
return mm256_is_zero((__m256i)x);
}
static void world_compute_local_boids_simd(BoidList *local_boids, World *world, ChunkGrid *chunks) {
struct b2l_cmp {
uboid_t from;
uboid_t *to_list;
uboid_t to_list_count;
__m256 *to_list_pos_x;
__m256 *to_list_pos_y;
__m256 *to_list_dir_x;
__m256 *to_list_dir_y;
int to_list_pos_count;
};
Boid *boids = world->boids.data();
int boid_count = world->boids.size();
MemoryArena *arena = &world->frame_arena;
RPROF_START("Extracting boid positions");
Vector2 *boid_dirs = (Vector2*)arena_malloc(arena, sizeof(Vector2)*boid_count);
Vector2 *boid_positions = (Vector2*)arena_malloc(arena, sizeof(Vector2)*boid_count);
for (int i = 0; i < boid_count; i++) {
boid_positions[i] = boids[i].pos;
boid_dirs[i] = boids[i].dir;
}
RPROF_STOP();
struct boid_pair {
uboid_t from;
uboid_t to;
};
static void world_process_local_boid_pairs(World *world, BoidList *local_boids, boid_pair *b2b_cmps, int *b2b_cmps_count) {
// Simplified from:
// float dot_threshold = Vector2DotProduct(dir, Vector2Rotate(dir, world->view_angle/2));
// |
@ -255,166 +249,233 @@ static void world_compute_local_boids_simd(BoidList *local_boids, World *world,
// float dot_threshold = cosf(world->view_angle/2);
__m256 dot_threshold = _mm256_set1_ps(cosf(world->view_angle/2));
RPROF_START("Calc dot products and ranges (simd)");
// TODO: Use temp memory arena inside this profile block
int32_t *do_append_mask1_f32 = (int32_t*)arena_malloc(arena, sizeof(int32_t)*8, 32);
int32_t *do_append_mask2_f32 = (int32_t*)arena_malloc(arena, sizeof(int32_t)*8, 32);
float view_radius_sqr = world->view_radius * world->view_radius;
__m256 view_radius = _mm256_set1_ps(view_radius_sqr);
__m256 zero = _mm256_set1_ps(0);
__m256 negative_one = _mm256_set1_ps(-1);
Boid *boids = world->boids.data();
int32_t *do_append_mask1_f32 = (int32_t*)arena_malloc(&world->frame_arena, sizeof(int32_t)*8, 32);
int32_t *do_append_mask2_f32 = (int32_t*)arena_malloc(&world->frame_arena, sizeof(int32_t)*8, 32);
int simd_iteration_count = nearest_multiple(*b2b_cmps_count, 8)/8;
for (int i = 0; i < simd_iteration_count; i++) {
__m256 from_pos_x = _mm256_set_ps(
boids[b2b_cmps[8*i+7].from].pos.x,
boids[b2b_cmps[8*i+6].from].pos.x,
boids[b2b_cmps[8*i+5].from].pos.x,
boids[b2b_cmps[8*i+4].from].pos.x,
boids[b2b_cmps[8*i+3].from].pos.x,
boids[b2b_cmps[8*i+2].from].pos.x,
boids[b2b_cmps[8*i+1].from].pos.x,
boids[b2b_cmps[8*i+0].from].pos.x
);
__m256 from_pos_y = _mm256_set_ps(
boids[b2b_cmps[8*i+7].from].pos.y,
boids[b2b_cmps[8*i+6].from].pos.y,
boids[b2b_cmps[8*i+5].from].pos.y,
boids[b2b_cmps[8*i+4].from].pos.y,
boids[b2b_cmps[8*i+3].from].pos.y,
boids[b2b_cmps[8*i+2].from].pos.y,
boids[b2b_cmps[8*i+1].from].pos.y,
boids[b2b_cmps[8*i+0].from].pos.y
);
__m256 to_pos_x = _mm256_set_ps(
boids[b2b_cmps[8*i+7].to].pos.x,
boids[b2b_cmps[8*i+6].to].pos.x,
boids[b2b_cmps[8*i+5].to].pos.x,
boids[b2b_cmps[8*i+4].to].pos.x,
boids[b2b_cmps[8*i+3].to].pos.x,
boids[b2b_cmps[8*i+2].to].pos.x,
boids[b2b_cmps[8*i+1].to].pos.x,
boids[b2b_cmps[8*i+0].to].pos.x
);
__m256 to_pos_y = _mm256_set_ps(
boids[b2b_cmps[8*i+7].to].pos.y,
boids[b2b_cmps[8*i+6].to].pos.y,
boids[b2b_cmps[8*i+5].to].pos.y,
boids[b2b_cmps[8*i+4].to].pos.y,
boids[b2b_cmps[8*i+3].to].pos.y,
boids[b2b_cmps[8*i+2].to].pos.y,
boids[b2b_cmps[8*i+1].to].pos.y,
boids[b2b_cmps[8*i+0].to].pos.y
);
__m256 sub_x = _mm256_sub_ps(from_pos_x, to_pos_x);
__m256 sub_y = _mm256_sub_ps(from_pos_y, to_pos_y);
__m256 length_sqr = _mm256_fmadd_ps(sub_y, sub_y, _mm256_mul_ps(sub_x, sub_x));
__m256i in_range_mask = (__m256i)_mm256_cmp_ps(length_sqr, view_radius, _CMP_LE_OQ);
if (mm256_is_zero(in_range_mask)) continue;
__m256 from_dir_x = _mm256_set_ps(
boids[b2b_cmps[8*i+7].from].dir.x,
boids[b2b_cmps[8*i+6].from].dir.x,
boids[b2b_cmps[8*i+5].from].dir.x,
boids[b2b_cmps[8*i+4].from].dir.x,
boids[b2b_cmps[8*i+3].from].dir.x,
boids[b2b_cmps[8*i+2].from].dir.x,
boids[b2b_cmps[8*i+1].from].dir.x,
boids[b2b_cmps[8*i+0].from].dir.x
);
__m256 from_dir_y = _mm256_set_ps(
boids[b2b_cmps[8*i+7].from].dir.y,
boids[b2b_cmps[8*i+6].from].dir.y,
boids[b2b_cmps[8*i+5].from].dir.y,
boids[b2b_cmps[8*i+4].from].dir.y,
boids[b2b_cmps[8*i+3].from].dir.y,
boids[b2b_cmps[8*i+2].from].dir.y,
boids[b2b_cmps[8*i+1].from].dir.y,
boids[b2b_cmps[8*i+0].from].dir.y
);
__m256 to_dir_x = _mm256_set_ps(
boids[b2b_cmps[8*i+7].to].dir.x,
boids[b2b_cmps[8*i+6].to].dir.x,
boids[b2b_cmps[8*i+5].to].dir.x,
boids[b2b_cmps[8*i+4].to].dir.x,
boids[b2b_cmps[8*i+3].to].dir.x,
boids[b2b_cmps[8*i+2].to].dir.x,
boids[b2b_cmps[8*i+1].to].dir.x,
boids[b2b_cmps[8*i+0].to].dir.x
);
__m256 to_dir_y = _mm256_set_ps(
boids[b2b_cmps[8*i+7].to].dir.y,
boids[b2b_cmps[8*i+6].to].dir.y,
boids[b2b_cmps[8*i+5].to].dir.y,
boids[b2b_cmps[8*i+4].to].dir.y,
boids[b2b_cmps[8*i+3].to].dir.y,
boids[b2b_cmps[8*i+2].to].dir.y,
boids[b2b_cmps[8*i+1].to].dir.y,
boids[b2b_cmps[8*i+0].to].dir.y
);
__m256 is_length_zero = (__m256)_mm256_cmpeq_epi32((__m256i)length_sqr, (__m256i)zero);
__m256 ilength = _mm256_blendv_ps(_mm256_rsqrt_ps(length_sqr), zero, is_length_zero);
__m256 x_norm = _mm256_mul_ps(sub_x, ilength);
__m256 y_norm = _mm256_mul_ps(sub_y, ilength);
__m256 x_neg_norm = _mm256_mul_ps(x_norm, negative_one);
__m256 y_neg_norm = _mm256_mul_ps(y_norm, negative_one);
__m256 dot_product1 = _mm256_fmadd_ps(from_dir_y, y_neg_norm, _mm256_mul_ps(from_dir_x, x_neg_norm));
__m256 in_angle_mask1 = _mm256_cmp_ps(dot_product1, dot_threshold, _CMP_GE_OQ);
__m256 do_append_mask1 = _mm256_and_ps(in_angle_mask1, (__m256)in_range_mask);
__m256 dot_product2 = _mm256_fmadd_ps(to_dir_y, y_norm, _mm256_mul_ps(to_dir_x, x_norm));
__m256 in_angle_mask2 = _mm256_cmp_ps(dot_product2, dot_threshold, _CMP_GE_OQ);
__m256 do_append_mask2 = _mm256_and_ps(in_angle_mask2, (__m256)in_range_mask);
_mm256_store_ps((float*)do_append_mask1_f32, do_append_mask1);
_mm256_store_ps((float*)do_append_mask2_f32, do_append_mask2);
for (int j = 0; j < 8; j++) {
uboid_t cmp_idx = 8*i + j;
if (cmp_idx >= *b2b_cmps_count) break;
uboid_t from_boid = b2b_cmps[cmp_idx].from;
uboid_t to_boid = b2b_cmps[cmp_idx].to;
if (do_append_mask1_f32[j]) {
boid_list_append(&world->frame_arena, &local_boids[from_boid], to_boid);
g_prof_interactions++;
}
if (do_append_mask2_f32[j]) {
boid_list_append(&world->frame_arena, &local_boids[to_boid], from_boid);
g_prof_interactions++;
}
}
}
*b2b_cmps_count = 0;
}
static void world_compute_local_boids_simd(BoidList *local_boids, World *world, ChunkGrid *chunks) {
Boid *boids = world->boids.data();
int boid_count = world->boids.size();
MemoryArena *arena = &world->frame_arena;
int b2b_padding = 8;
int b2b_capacity = 2048*10;
boid_pair b2b_cmps[b2b_capacity + b2b_padding];
int b2b_cmps_count = 0;
for (int i = 0; i < b2b_padding; i++) {
memset(&b2b_cmps[b2b_capacity + i], 0, sizeof(boid_pair));
}
RPROF_START("Move chunk data to static arrays");
uboid_t *static_chunks[chunks->width * chunks->height];
for (int y = 0; y < chunks->height; y++) {
for (int x = 0; x < chunks->width; x++) {
BoidList *chunk = chunkgrid_get(chunks, x, y);
size_t chunk_idx = chunkgrid_get_idx(chunks, x, y);
BoidList *chunk = &chunks->data[chunk_idx];
static_chunks[chunk_idx] = (uboid_t*)arena_malloc(arena, sizeof(uboid_t) * chunk->count);
boid_list_to_array(static_chunks[chunk_idx], chunk);
}
}
RPROF_STOP();
RPROF_START("Calc dot products and ranges (simd)");
for (int y = 0; y < chunks->height; y++) {
for (int x = 0; x < chunks->width; x++) {
size_t chunk_idx = chunkgrid_get_idx(chunks, x, y);
BoidList *chunk = &chunks->data[chunk_idx];
if (chunk->count == 0) continue;
std::vector<b2l_cmp> b2l_cmps; // TODO: remove usage of std::vec<T>, it is kinda slow
b2l_cmps.reserve(64);
uboid_t chunk_boids[chunk->count];
Vector2 chunk_boids_pos[chunk->count + 8];
Vector2 chunk_boids_dir[chunk->count + 8];
memset(chunk_boids_pos, 0, sizeof(Vector2) * (chunk->count + 8));
boid_list_to_array(chunk_boids, chunk);
for (int i = 0; i < chunk->count; i++) {
uboid_t boid = chunk_boids[i];
chunk_boids_pos[i] = boid_positions[boid];
chunk_boids_dir[i] = boid_dirs[boid];
}
uboid_t *chunk_boids = static_chunks[chunk_idx];
for (int i = 0; i < chunk->count-1; i++) {
uboid_t from_boid = chunk_boids[i];
uboid_t *to_boids = &chunk_boids[i+1];
uboid_t to_boids_count = chunk->count-i-1;
Vector2 *to_chunk_boids_pos = &chunk_boids_pos[i+1];
Vector2 *to_chunk_boids_dir = &chunk_boids_dir[i+1];
b2l_cmp cmp = {};
cmp.from = from_boid;
cmp.to_list = to_boids;
cmp.to_list_count = to_boids_count;
int to_boids_count_8 = nearest_multiple(to_boids_count, 8);
cmp.to_list_pos_count = to_boids_count_8/8;
cmp.to_list_pos_x = (__m256*)arena_malloc(arena, sizeof(__m256) * cmp.to_list_pos_count, sizeof(__m256));
cmp.to_list_pos_y = (__m256*)arena_malloc(arena, sizeof(__m256) * cmp.to_list_pos_count, sizeof(__m256));
cmp.to_list_dir_x = (__m256*)arena_malloc(arena, sizeof(__m256) * cmp.to_list_pos_count, sizeof(__m256));
cmp.to_list_dir_y = (__m256*)arena_malloc(arena, sizeof(__m256) * cmp.to_list_pos_count, sizeof(__m256));
vector2_list_to_simd8(to_chunk_boids_pos, to_boids_count_8, cmp.to_list_pos_x, cmp.to_list_pos_y);
vector2_list_to_simd8(to_chunk_boids_dir, to_boids_count_8, cmp.to_list_dir_x, cmp.to_list_dir_y);
b2l_cmps.push_back(cmp);
for (int j = 0; j < to_boids_count; j++) {
// TODO:
// DEBUG_ASSERT(b2b_cmps_count < b2b_capacity-1);
b2b_cmps[b2b_cmps_count++] = {
.from = from_boid,
.to = to_boids[j]
};
}
}
Vector2 neighbours[] = { { 1, 0 }, { 0, 1 }, { 1, 1 }, { -1, 1 } };
for (int i = 0; i < ARRAY_LEN(neighbours); i++) {
int chunk_y = y + neighbours[i].y;
int chunk_x = x + neighbours[i].x;
for (int neighbour_i = 0; neighbour_i < ARRAY_LEN(neighbours); neighbour_i++) {
int chunk_y = y + neighbours[neighbour_i].y;
if (chunk_y < 0 || chunk_y >= chunks->height) continue;
int chunk_x = x + neighbours[neighbour_i].x;
if (chunk_x < 0 || chunk_x >= chunks->width) continue;
BoidList *neighbour_chunk = chunkgrid_get(chunks, chunk_x, chunk_y);
size_t neighbour_idx = chunkgrid_get_idx(chunks, chunk_x, chunk_y);
BoidList *neighbour_chunk = &chunks->data[neighbour_idx];
if (neighbour_chunk->count == 0) continue;
// TODO: alloc 'neighbour_ids' into scratch arena
uboid_t *neighbour_ids = (uboid_t*)arena_malloc(arena, sizeof(uboid_t)*neighbour_chunk->count);
boid_list_to_array(neighbour_ids, neighbour_chunk);
Vector2 neighbour_boids_pos[neighbour_chunk->count + 8];
Vector2 neighbour_boids_dir[neighbour_chunk->count + 8];
memset(neighbour_boids_pos, 0, sizeof(Vector2) * (neighbour_chunk->count + 8));
for (int i = 0; i < neighbour_chunk->count; i++) {
neighbour_boids_pos[i] = boid_positions[neighbour_ids[i]];
neighbour_boids_dir[i] = boid_dirs[neighbour_ids[i]];
}
int to_boids_count_8 = nearest_multiple(neighbour_chunk->count, 8);
__m256 *to_list_pos_x = (__m256*)arena_malloc(arena, sizeof(__m256) * to_boids_count_8/8, sizeof(__m256));
__m256 *to_list_pos_y = (__m256*)arena_malloc(arena, sizeof(__m256) * to_boids_count_8/8, sizeof(__m256));
__m256 *to_list_dir_x = (__m256*)arena_malloc(arena, sizeof(__m256) * to_boids_count_8/8, sizeof(__m256));
__m256 *to_list_dir_y = (__m256*)arena_malloc(arena, sizeof(__m256) * to_boids_count_8/8, sizeof(__m256));
vector2_list_to_simd8(neighbour_boids_pos, to_boids_count_8, to_list_pos_x, to_list_pos_y);
vector2_list_to_simd8(neighbour_boids_dir, to_boids_count_8, to_list_dir_x, to_list_dir_y);
uboid_t boid1;
BoidsListNodeIterator it1 = boid_list_get_iterator(chunk);
while (boid_list_iterator_next(&it1, &boid1)) {
b2l_cmp cmp = {};
cmp.from = boid1;
cmp.to_list = neighbour_ids;
cmp.to_list_count = neighbour_chunk->count;
cmp.to_list_pos_x = to_list_pos_x;
cmp.to_list_pos_y = to_list_pos_y;
cmp.to_list_dir_x = to_list_dir_x;
cmp.to_list_dir_y = to_list_dir_y;
cmp.to_list_pos_count = to_boids_count_8/8;
b2l_cmps.push_back(cmp);
}
}
for (int i = 0; i < b2l_cmps.size(); i++) {
b2l_cmp *cmp = &b2l_cmps[i];
uboid_t from_boid = cmp->from;
Vector2 from_pos = boid_positions[from_boid];
Vector2 from_dir = boid_dirs[from_boid];
float view_radius_sqr = world->view_radius * world->view_radius;
__m256 view_radius = _mm256_set1_ps(view_radius_sqr);
__m256 from_pos_x = _mm256_set1_ps(from_pos.x);
__m256 from_pos_y = _mm256_set1_ps(from_pos.y);
__m256 from_dir_x = _mm256_set1_ps(from_dir.x);
__m256 from_dir_y = _mm256_set1_ps(from_dir.y);
__m256 zero = _mm256_set1_ps(0);
__m256 negative_one = _mm256_set1_ps(-1);
__m256i to_list_count = _mm256_set1_epi32(cmp->to_list_count);
for (int j = 0; j < cmp->to_list_pos_count; j++) {
__m256 to_pos_x = cmp->to_list_pos_x[j];
__m256 to_pos_y = cmp->to_list_pos_y[j];
__m256 to_dir_x = cmp->to_list_dir_x[j];
__m256 to_dir_y = cmp->to_list_dir_y[j];
__m256 sub_x = _mm256_sub_ps(from_pos_x, to_pos_x);
__m256 sub_y = _mm256_sub_ps(from_pos_y, cmp->to_list_pos_y[j]);
__m256 x_sqr = _mm256_mul_ps(sub_x, sub_x);
__m256 length_sqr = _mm256_fmadd_ps(sub_y, sub_y, x_sqr);
__m256i in_range_mask = (__m256i)_mm256_cmp_ps(length_sqr, view_radius, _CMP_LE_OQ);
__m256 is_length_zero = _mm256_cmp_ps(length_sqr, zero, _CMP_EQ_OQ);
__m256 ilength = _mm256_blendv_ps(_mm256_rsqrt_ps(length_sqr), zero, is_length_zero);
__m256 x_norm = _mm256_mul_ps(sub_x, ilength);
__m256 y_norm = _mm256_mul_ps(sub_y, ilength);
__m256 x_neg_norm = _mm256_mul_ps(x_norm, negative_one);
__m256 y_neg_norm = _mm256_mul_ps(y_norm, negative_one);
__m256 dot_product1 = _mm256_fmadd_ps(from_dir_y, y_neg_norm, _mm256_mul_ps(from_dir_x, x_neg_norm));
__m256 in_angle_mask1 = _mm256_cmp_ps(dot_product1, dot_threshold, _CMP_GE_OQ);
__m256 do_append_mask1 = _mm256_and_ps(in_angle_mask1, (__m256)in_range_mask);
__m256 dot_product2 = _mm256_fmadd_ps(to_dir_y, y_norm, _mm256_mul_ps(to_dir_x, x_norm));
__m256 in_angle_mask2 = _mm256_cmp_ps(dot_product2, dot_threshold, _CMP_GE_OQ);
__m256 do_append_mask2 = _mm256_and_ps(in_angle_mask2, (__m256)in_range_mask);
_mm256_store_ps((float*)do_append_mask1_f32, do_append_mask1);
_mm256_store_ps((float*)do_append_mask2_f32, do_append_mask2);
for (int k = 0; k < 8; k++) {
uboid_t to_boid_idx = 8*j + k;
if (to_boid_idx >= cmp->to_list_count) break;
uboid_t to_boid = cmp->to_list[to_boid_idx];
if (do_append_mask1_f32[k]) {
boid_list_append(&world->frame_arena, &local_boids[from_boid], to_boid);
g_prof_interactions++;
}
if (do_append_mask2_f32[k]) {
boid_list_append(&world->frame_arena, &local_boids[to_boid], from_boid);
g_prof_interactions++;
}
uboid_t *neighbour_boids = static_chunks[neighbour_idx];
for (int i = 0; i < chunk->count; i++) {
for (int j = 0; j < neighbour_chunk->count; j++) {
// TODO:
// DEBUG_ASSERT(b2b_cmps_count < b2b_capacity-1);
b2b_cmps[b2b_cmps_count++] = {
.from = chunk_boids[i],
.to = neighbour_boids[j]
};
}
}
}
if (b2b_cmps_count > 2048*3) {
world_process_local_boid_pairs(world, local_boids, b2b_cmps, &b2b_cmps_count);
}
}
}
if (b2b_cmps_count > 0) {
world_process_local_boid_pairs(world, local_boids, b2b_cmps, &b2b_cmps_count);
}
RPROF_STOP();
}
@ -470,8 +531,7 @@ static BoidList* world_compute_local_boids(World *world) {
}
RPROF_STOP();
size_t alloc_chunks = world->frame_arena.offset;
float chunk_size = std::max(world->view_radius, 15.0f);
float chunk_size = std::max(world->view_radius, 1.0f);
int chunks_wide = std::ceil(world->size.x / chunk_size) + 1;
int chunks_high = std::ceil(world->size.y / chunk_size) + 1;
@ -483,25 +543,17 @@ static BoidList* world_compute_local_boids(World *world) {
RPROF_START("Assign boids to chunks");
for (int i = 0; i < boid_count; i++) {
Boid *boid = &boids[i];
int chunk_x = boid->pos.x / chunk_size;
int chunk_y = boid->pos.y / chunk_size;
int x = boid->pos.x / chunk_size;
int y = boid->pos.y / chunk_size;
boid_list_append(arena, chunkgrid_get(&chunks, chunk_x, chunk_y), i);
}
RPROF_STOP();
RPROF_START("Extracting boid positions");
Vector2 *boid_dirs = (Vector2*)arena_malloc(arena, sizeof(Vector2)*boid_count);
Vector2 *boid_positions = (Vector2*)arena_malloc(arena, sizeof(Vector2)*boid_count);
for (int i = 0; i < boid_count; i++) {
boid_positions[i] = boids[i].pos;
boid_dirs[i] = boids[i].dir;
boid_list_append(arena, chunkgrid_get(&chunks, x, y), i);
}
RPROF_STOP();
RPROF_START("world_compute_local_boids()");
// TODO: Use scalar version for WASM or make 128bit version.
world_compute_local_boids_simd(all_local_boids, world, &chunks);
RPROF_STOP();
return all_local_boids;
@ -519,6 +571,10 @@ static void world_update(World *world, float dt) {
BoidList *list_of_local_boids = world_compute_local_boids(world);
RPROF_START("Apply forces");
int avoidance_ray_count = world->collision_avoidance_ray_count * 2 + 1;
Vector2 avoidance_ray_dirs[avoidance_ray_count];
fill_avoidance_ray_dirs(avoidance_ray_dirs, avoidance_ray_count, world->collision_avoidance_ray_angle);
for (int i = 0; i < boid_count; i++) {
Boid *boid = &world->boids[i];
Vector2 acc = { 0, 0 };
@ -556,7 +612,7 @@ static void world_update(World *world, float dt) {
}
// Apply obstacle avoidance to accelaration
Vector2 collision_avoidance = get_collision_avoidance_dir(world, boid);
Vector2 collision_avoidance = get_collision_avoidance_dir(world, boid, avoidance_ray_dirs, avoidance_ray_count);
acc = Vector2Add(acc, vector2_mul_value(collision_avoidance, world->collision_avoidance_strength));
acc = vector2_mul_value(acc, world->max_speed);