cleanup computation of local boids

This commit is contained in:
Rokas Puzonas 2023-07-31 01:20:39 +03:00
parent 5af509f4fd
commit 995d41c190
8 changed files with 174 additions and 149 deletions

View File

@ -14,7 +14,7 @@ WEB_HEAP_SIZE := 335544320
WEB_STACK_SIZE := 196608
WEB_SHELL := src/shell.html
COMPILER_FLAGS := -std=c++17 -Wno-enum-compare -O3 -g
COMPILER_FLAGS := -std=c++17 -Wno-enum-compare -O3 -g -flto
LINKER_FLAGS := -lraylib
# SOURCES := $(wildcard src/*.cpp)

View File

@ -2,17 +2,17 @@
#include "boid-list.hpp"
static void boid_list_init(BoidList *list)
// Resets `list` to its empty state: the stored count becomes zero and the
// embedded head node no longer links to a following node.
void boid_list_init(BoidList *list) {
    list->count = 0;
    list->node.next = NULL;
}
static BoidsListNodeIterator boid_list_get_iterator(BoidList *list) {
// Creates an iterator for `list`, seeded with the list's current element
// count, a starting index of 0, and a pointer to the list's embedded head node.
BoidsListNodeIterator boid_list_get_iterator(BoidList *list) {
    BoidsListNodeIterator iterator = { .count = list->count, .i = 0, .node = &list->node };
    return iterator;
}
static bool boid_list_iterator_next(BoidsListNodeIterator *iterator, uboid_t *value) {
bool boid_list_iterator_next(BoidsListNodeIterator *iterator, uboid_t *value) {
if (iterator->count == 0) {
return false;
}
@ -28,7 +28,7 @@ static bool boid_list_iterator_next(BoidsListNodeIterator *iterator, uboid_t *va
return true;
}
static void boid_list_append(MemoryArena *arena, BoidList *list, uboid_t new_boid) {
void boid_list_append(MemoryArena *arena, BoidList *list, uboid_t new_boid) {
int left_count = list->count;
BoidListNode *prev = &list->node;
BoidListNode *curr = &list->node;
@ -48,7 +48,7 @@ static void boid_list_append(MemoryArena *arena, BoidList *list, uboid_t new_boi
list->count++;
}
static void boid_list_append_unique(MemoryArena *arena, BoidList *list, uboid_t new_boid) {
void boid_list_append_unique(MemoryArena *arena, BoidList *list, uboid_t new_boid) {
int left_count = list->count;
BoidListNode *last = &list->node;
BoidListNode *curr = &list->node;
@ -72,8 +72,7 @@ static void boid_list_append_unique(MemoryArena *arena, BoidList *list, uboid_t
list->count++;
}
static void boid_list_to_array(uboid_t *result, BoidList *list)
{
void boid_list_to_array(uboid_t *result, BoidList *list) {
int i = 0;
uboid_t boid;
BoidsListNodeIterator it = boid_list_get_iterator(list);

View File

@ -22,11 +22,11 @@ struct BoidsListNodeIterator {
BoidListNode *node;
};
static BoidsListNodeIterator boid_list_get_iterator(BoidList *list);
static bool boid_list_iterator_next(BoidsListNodeIterator *iterator, uboid_t *value);
BoidsListNodeIterator boid_list_get_iterator(BoidList *list);
bool boid_list_iterator_next(BoidsListNodeIterator *iterator, uboid_t *value);
static void boid_list_init(BoidList *list);
static void boid_list_append(MemoryArena *arena, BoidList *list, uboid_t new_boid);
static void boid_list_append_unique(MemoryArena *arena, BoidList *list, uboid_t new_boid);
void boid_list_init(BoidList *list);
void boid_list_append(MemoryArena *arena, BoidList *list, uboid_t new_boid);
void boid_list_append_unique(MemoryArena *arena, BoidList *list, uboid_t new_boid);
static void boid_list_to_array(uboid_t *result, BoidList *list);
void boid_list_to_array(uboid_t *result, BoidList *list);

View File

@ -54,7 +54,7 @@ int main() {
world_init(&g_world, screen_width, screen_height);
float border = g_world.collision_avoidance_distance;
for (int i = 0; i < 10000; i++) {
for (int i = 0; i < 33000; i++) {
Boid boid;
boid_rand_init(&g_world, &boid, border);
g_world.boids.push_back(boid);
@ -102,9 +102,9 @@ static void profiling_test() {
}
rprof_end();
printf("interactions: %d\n", interactions);
if (interactions != 33119854) { // 22 051 739
printf("!!!!!! ITERACTIONS DONT MATCH, %d\n", interactions - 33119854);
printf("interactions: %d\n", g_prof_interactions);
if (g_prof_interactions != 33119854) { // 22 051 739
printf("!!!!!! ITERACTIONS DONT MATCH, %d\n", g_prof_interactions - 33119854);
}
rprof_output(NULL);

View File

@ -47,7 +47,7 @@ static void get_intersect_with_obstacles(RayHitResult *result, Vector2 ray_origi
}
}
static void get_intersect_with_world(RayHitResult *result, Vector2 ray_origin, Vector2 ray_dir, World *world) {
void get_intersect_with_world(RayHitResult *result, Vector2 ray_origin, Vector2 ray_dir, World *world) {
get_intersect_with_obstacles(result, ray_origin, ray_dir, &world->obstacles);
if (result->hit == -1 && !world->looping_walls) {

View File

@ -13,4 +13,4 @@ static float get_intersect_point(Vector2 ray_origin, Vector2 ray_dir, Vector2 li
static void set_nearest_hit(RayHitResult *nearest_hit, float hit, Vector2 line1, Vector2 line2);
static void get_intersect_with_polygon(RayHitResult *result, Vector2 ray_origin, Vector2 ray_dir, Vector2 *points, int point_count);
static void get_intersect_with_obstacles(RayHitResult *result, Vector2 ray_origin, Vector2 ray_dir, std::vector<Obstacle> *obstacles);
static void get_intersect_with_world(RayHitResult *result, Vector2 ray_origin, Vector2 ray_dir, World *world);
void get_intersect_with_world(RayHitResult *result, Vector2 ray_origin, Vector2 ray_dir, World *world);

View File

@ -6,13 +6,11 @@ struct VerticalLayout {
float gap;
};
static Rectangle rect_with_offset(Rectangle rect, Vector2 offset)
{
// Returns a copy of `rect` whose origin is shifted by `offset`;
// width and height are carried over unchanged.
static Rectangle rect_with_offset(Rectangle rect, Vector2 offset) {
    Rectangle shifted = rect;
    shifted.x = rect.x + offset.x;
    shifted.y = rect.y + offset.y;
    return shifted;
}
static Rectangle rect_with_offset(Rectangle rect, float x, float y)
{
// Returns a copy of `rect` whose origin is shifted by (`x`, `y`);
// width and height are carried over unchanged.
static Rectangle rect_with_offset(Rectangle rect, float x, float y) {
    Rectangle shifted = rect;
    shifted.x = rect.x + x;
    shifted.y = rect.y + y;
    return shifted;
}

View File

@ -125,24 +125,49 @@ static void world_free(World *world) {
// --------------------- Update -----------------------
static int interactions = 0;
// A 2D grid of boid lists stored in one flat array.
struct ChunkGrid {
// Cell storage; chunkgrid_get() addresses cell (x, y) as data[y * width + x].
BoidList *data;
// Number of cells in each row.
int width;
// Number of rows.
int height;
};
static int nearest_multiple(int num, int divisor)
{
// Returns a pointer to the list stored for cell (x, y). Cells are laid out
// row by row in `grid->data`. No bounds checking is performed on `x` or `y`.
static BoidList *chunkgrid_get(ChunkGrid *grid, int x, int y) {
    int cell_index = y * grid->width + x;
    return grid->data + cell_index;
}
// Sets up `grid` as a `width` x `height` grid: the cell array is allocated
// from `arena` and boid_list_init() is run on every cell.
static void chunkgrid_init(MemoryArena *arena, ChunkGrid *grid, int width, int height) {
    grid->width = width;
    grid->height = height;
    grid->data = (BoidList*)arena_malloc(arena, width * height * sizeof(BoidList));

    // Visit the cells row by row, left to right.
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++) {
            BoidList *cell = chunkgrid_get(grid, col, row);
            boid_list_init(cell);
        }
    }
}
static int g_prof_interactions = 0;
// For a positive `divisor`, rounds `num` up to a multiple of `divisor`.
// A value that is already a multiple is returned unchanged.
static int nearest_multiple(int num, int divisor) {
    int remainder = num % divisor;
    // Removing the remainder yields the multiple at or below `num`
    // (at or above it when `num` is negative, since % truncates toward zero).
    int truncated = num - remainder;
    if (remainder > 0) {
        return truncated + divisor;
    }
    return truncated;
}
// b2b = boid to boid comparison
static void assign_local_boids_b2b(World *world, BoidList *local_boids, uboid_t from_boid, uboid_t to_boid, Vector2 offset, float length_sqr)
{
assert(to_boid != from_boid);
// b2l = boid to (list of boids) comparison
static void assign_local_boids_b2l(World *world, BoidList *local_boids, uboid_t from_boid, uboid_t *to_boids, uboid_t to_boids_count) {
Boid *boids = world->boids.data();
// Simplified from: float dot_threshold = Vector2DotProduct(dir, Vector2Rotate(dir, world->view_angle/2));
float dot_threshold = cosf(world->view_angle/2);
float view_radius_sqr = world->view_radius * world->view_radius;
bool with_in_range = length_sqr <= (world->view_radius * world->view_radius);
if (with_in_range) {
interactions++;
for (int i = 0; i < to_boids_count; i++) {
uint16_t to_boid = to_boids[i];
assert(to_boid != from_boid);
Vector2 offset = Vector2Subtract(boids[from_boid].pos, boids[to_boid].pos);
float length_sqr = Vector2LengthSqr(offset);
if (length_sqr > view_radius_sqr) continue;
Vector2 normalized = offset;
if (length_sqr != 0)
@ -152,52 +177,19 @@ static void assign_local_boids_b2b(World *world, BoidList *local_boids, uboid_t
normalized.y *= ilength;
}
// printf("----\n");
// printf("boid:%d->%d, lengths_sqr:%f, offset:(%f,%f), look:(%f,%f)\n", from_boid, to_boid, lengths_sqr, offset.x, offset.y, boids[from_boid].dir.x, boids[from_boid].dir.y);
Boid *boids = world->boids.data();
if (Vector2DotProduct(boids[from_boid].dir, Vector2Negate(normalized)) >= dot_threshold) {
boid_list_append(&world->frame_arena, &local_boids[from_boid], to_boid);
g_prof_interactions++;
}
if (Vector2DotProduct(boids[to_boid].dir, normalized) >= dot_threshold) {
boid_list_append(&world->frame_arena, &local_boids[to_boid], from_boid);
g_prof_interactions++;
}
}
}
// b2l = boid to (list of boids) comparison
static void assign_local_boids_b2l(World *world, BoidList *local_boids, uboid_t from_boid, uboid_t *to_boids, uboid_t to_boids_count)
{
Boid *boids = world->boids.data();
int to_boids_count_8 = nearest_multiple(to_boids_count, 8);
Vector2 to_positions[to_boids_count_8];
for (int i = 0; i < to_boids_count; i++) {
to_positions[i] = boids[to_boids[i]].pos;
}
// Vector2 offsets[to_boids_count_8];
// vector2_sub_simd8(offsets, boids[from_boid].pos, to_positions, to_boids_count_8);
// float lengths_sqrs[to_boids_count_8];
// vector2_length_sqr_simd8(lengths_sqrs, offsets, to_boids_count_8);
for (int i = 0; i < to_boids_count; i++) {
uint16_t to_boid = to_boids[i];
// Vector2 offset = offsets[i];
// float lengths_sqr = lengths_sqrs[i];
Vector2 offset = Vector2Subtract(boids[from_boid].pos, boids[to_boid].pos);
float lengths_sqr = Vector2LengthSqr(offset);
assign_local_boids_b2b(world, local_boids, from_boid, to_boid, offset, lengths_sqr);
}
}
static void vector2_list_to_simd8(Vector2 *vecs, int vec_count, __m256 *vecs_x, __m256 *vecs_y)
{
assert(vec_count % 8 == 0 && "Vector2 count must be divisible by 8");
static void vector2_list_to_simd8(Vector2 *vecs, int vec_count, __m256 *vecs_x, __m256 *vecs_y) {
DEBUG_ASSERT(vec_count % 8 == 0 && "Vector2 count must be divisible by 8");
for (int i = 0; i < vec_count/8; i++) {
vecs_x[i] = _mm256_set_ps(
@ -225,47 +217,27 @@ static void vector2_list_to_simd8(Vector2 *vecs, int vec_count, __m256 *vecs_x,
}
}
static void world_update(World *world, float dt) {
if (world->freeze) return;
// BUG: functions `world_compute_local_boids_simd` and `world_compute_local_boids_scalar` don't give the
// same result. Investigate further with profiling. Probably related to how neighbour chunks are iterated.
// The difference isn't huge, but it exists even at -O0.
// It is also unclear which of the two is the more correct one.
MemoryArena *arena = &world->frame_arena;
arena_clear(arena);
static void world_compute_local_boids_simd(BoidList *local_boids, World *world, ChunkGrid *chunks) {
struct b2l_cmp {
uboid_t from;
uboid_t *to_list;
uboid_t to_list_count;
__m256 *to_list_pos_x;
__m256 *to_list_pos_y;
__m256 *to_list_dir_x;
__m256 *to_list_dir_y;
int to_list_pos_count;
};
Boid *boids = world->boids.data();
int boid_count = world->boids.size();
assert(boid_count <= MAX_BOIDS);
RPROF_START("Alloc groups");
BoidList *all_local_boids = (BoidList*)arena_malloc(arena, boid_count * sizeof(BoidList));
for (int i = 0; i < boid_count; i++) {
boid_list_init(&all_local_boids[i]);
}
RPROF_STOP();
size_t alloc_chunks = world->frame_arena.offset;
float chunk_size = std::max(world->view_radius, 15.0f);
int chunks_wide = std::ceil(world->size.x / chunk_size) + 1;
int chunks_high = std::ceil(world->size.y / chunk_size) + 1;
RPROF_START("Alloc chunks");
BoidList *chunks[chunks_high][chunks_wide];
for (int y = 0; y < chunks_high; y++) {
for (int x = 0; x < chunks_wide; x++) {
chunks[y][x] = (BoidList*)arena_malloc(arena, sizeof(BoidList));
boid_list_init(chunks[y][x]);
}
}
RPROF_STOP();
RPROF_START("Creating chunks");
for (int i = 0; i < boid_count; i++) {
Boid *boid = &boids[i];
int chunk_x = boid->pos.x / chunk_size;
int chunk_y = boid->pos.y / chunk_size;
boid_list_append(arena, chunks[chunk_y][chunk_x], i);
}
RPROF_STOP();
MemoryArena *arena = &world->frame_arena;
RPROF_START("Extracting boid positions");
Vector2 *boid_dirs = (Vector2*)arena_malloc(arena, sizeof(Vector2)*boid_count);
@ -276,31 +248,21 @@ static void world_update(World *world, float dt) {
}
RPROF_STOP();
// Simplified from:
// float dot_threshold = Vector2DotProduct(dir, Vector2Rotate(dir, world->view_angle/2));
// |
// v
// float dot_threshold = cosf(world->view_angle/2);
__m256 dot_threshold = _mm256_set1_ps(cosf(world->view_angle/2));
int chunk_cmps = 0;
RPROF_START("Calc dot products and ranges (chunked)");
RPROF_START("Calc dot products and ranges (simd)");
// TODO: Use temp memory arena inside this profile block
// int32_t *in_range_mask_f32 = (int32_t*)arena_malloc(arena, sizeof(int32_t)*8, 32);
int32_t *do_append_mask1_f32 = (int32_t*)arena_malloc(arena, sizeof(int32_t)*8, 32);
int32_t *do_append_mask2_f32 = (int32_t*)arena_malloc(arena, sizeof(int32_t)*8, 32);
for (int y = 0; y < chunks_high; y++) {
Vector2 neighbours[] = { { 1, 0 }, { 0, 1 }, { 1, 1 }, { -1, 1 } };
struct b2l_cmp {
uboid_t from;
uboid_t *to_list;
uboid_t to_list_count;
__m256 *to_list_pos_x;
__m256 *to_list_pos_y;
__m256 *to_list_dir_x;
__m256 *to_list_dir_y;
int to_list_pos_count;
};
for (int x = 0; x < chunks_wide; x++) {
BoidList *chunk = chunks[y][x];
for (int y = 0; y < chunks->height; y++) {
for (int x = 0; x < chunks->width; x++) {
BoidList *chunk = chunkgrid_get(chunks, x, y);
if (chunk->count == 0) continue;
std::vector<b2l_cmp> b2l_cmps; // TODO: remove usage of std::vec<T>, it is kinda slow
@ -340,13 +302,14 @@ static void world_update(World *world, float dt) {
b2l_cmps.push_back(cmp);
}
Vector2 neighbours[] = { { 1, 0 }, { 0, 1 }, { 1, 1 }, { -1, 1 } };
for (int i = 0; i < ARRAY_LEN(neighbours); i++) {
int chunk_y = y + neighbours[i].y;
int chunk_x = x + neighbours[i].x;
if (chunk_y < 0 || chunk_y >= chunks_high) continue;
if (chunk_x < 0 || chunk_x >= chunks_wide) continue;
if (chunk_y < 0 || chunk_y >= chunks->height) continue;
if (chunk_x < 0 || chunk_x >= chunks->width) continue;
BoidList *neighbour_chunk = chunks[chunk_y][chunk_x];
BoidList *neighbour_chunk = chunkgrid_get(chunks, chunk_x, chunk_y);
if (neighbour_chunk->count == 0) continue;
// TODO: alloc 'neighbour_ids' into scratch arena
@ -393,10 +356,6 @@ static void world_update(World *world, float dt) {
float view_radius_sqr = world->view_radius * world->view_radius;
// Simplified from: float dot_threshold = Vector2DotProduct(dir, Vector2Rotate(dir, world->view_angle/2));
float dot_threshold_single = cosf(world->view_angle/2);
__m256 dot_threshold = _mm256_set1_ps(dot_threshold_single);
__m256 view_radius = _mm256_set1_ps(view_radius_sqr);
__m256 from_pos_x = _mm256_set1_ps(from_pos.x);
__m256 from_pos_y = _mm256_set1_ps(from_pos.y);
@ -444,34 +403,45 @@ static void world_update(World *world, float dt) {
uboid_t to_boid = cmp->to_list[to_boid_idx];
if (do_append_mask1_f32[k]) {
boid_list_append(&world->frame_arena, &all_local_boids[from_boid], to_boid);
interactions++;
boid_list_append(&world->frame_arena, &local_boids[from_boid], to_boid);
g_prof_interactions++;
}
if (do_append_mask2_f32[k]) {
boid_list_append(&world->frame_arena, &all_local_boids[to_boid], from_boid);
interactions++;
boid_list_append(&world->frame_arena, &local_boids[to_boid], from_boid);
g_prof_interactions++;
}
}
}
}
}
}
RPROF_STOP();
}
static void world_compute_local_boids_scalar(BoidList *local_boids, World *world, ChunkGrid *chunks) {
RPROF_START("Calc dot products and ranges (scalar)");
for (int y = 0; y < chunks->height; y++) {
for (int x = 0; x < chunks->width; x++) {
BoidList *chunk = chunkgrid_get(chunks, x, y);
if (chunk->count == 0) continue;
/*
uboid_t chunk_boids[chunk->count];
boid_list_to_array(chunk_boids, chunk);
for (int i = 0; i < chunk->count-1; i++) {
uboid_t from_boid = chunk_boids[i];
uboid_t *to_boids = &chunk_boids[i+1];
uboid_t to_boids_count = chunk->count-i-1;
assign_local_boids_b2l(world, all_local_boids, from_boid, to_boids, to_boids_count);
assign_local_boids_b2l(world, local_boids, from_boid, to_boids, to_boids_count);
}
Vector2 neighbours[] = { { 1, 0 }, { 0, 1 }, { 1, 1 }, { -1, 1 } };
for (int i = 0; i < ARRAY_LEN(neighbours); i++) {
int chunk_y = y + neighbours[i].y;
int chunk_x = x + neighbours[i].x;
if (chunk_y < 0 || chunk_y >= chunks_high) continue;
if (chunk_x < 0 || chunk_x >= chunks_wide) continue;
if (chunk_y < 0 || chunk_y >= chunks->height) continue;
if (chunk_x < 0 || chunk_x >= chunks->width) continue;
BoidList *neighbour_chunk = chunks[chunk_y][chunk_x];
BoidList *neighbour_chunk = chunkgrid_get(chunks, chunk_x, chunk_y);
if (neighbour_chunk->count == 0) continue;
uboid_t neighbour_ids[neighbour_chunk->count];
@ -480,22 +450,80 @@ static void world_update(World *world, float dt) {
uboid_t boid1;
BoidsListNodeIterator it1 = boid_list_get_iterator(chunk);
while (boid_list_iterator_next(&it1, &boid1)) {
assign_local_boids_b2l(world, all_local_boids, boid1, neighbour_ids, neighbour_chunk->count);
assign_local_boids_b2l(world, local_boids, boid1, neighbour_ids, neighbour_chunk->count);
}
}
*/
}
}
RPROF_STOP();
}
// Builds the per-boid "local boids" lists for the current world state.
//
// Steps:
//   1. Allocate one empty BoidList per boid from the world's frame arena.
//   2. Partition the world into a grid of square chunks and append every boid
//      index to the list of the chunk containing its position.
//   3. Let world_compute_local_boids_simd() fill the per-boid lists from the
//      chunk grid.
//
// Returns an array with one BoidList per boid, indexed by boid index. The array
// and everything it links to live in `world->frame_arena`.
static BoidList* world_compute_local_boids(World *world) {
    Boid *boids = world->boids.data();
    int boid_count = world->boids.size();
    MemoryArena *arena = &world->frame_arena;

    RPROF_START("Alloc groups");
    BoidList *all_local_boids = (BoidList*)arena_malloc(arena, boid_count * sizeof(BoidList));
    for (int i = 0; i < boid_count; i++) {
        boid_list_init(&all_local_boids[i]);
    }
    RPROF_STOP();

    // A chunk is at least one view radius wide (and never smaller than 15 units).
    float chunk_size = std::max(world->view_radius, 15.0f);
    // NOTE(review): the extra chunk per axis presumably covers positions lying
    // exactly on the far edge of the world - confirm.
    int chunks_wide = std::ceil(world->size.x / chunk_size) + 1;
    int chunks_high = std::ceil(world->size.y / chunk_size) + 1;

    RPROF_START("Alloc chunks");
    ChunkGrid chunks;
    chunkgrid_init(arena, &chunks, chunks_wide, chunks_high);
    RPROF_STOP();

    RPROF_START("Assign boids to chunks");
    for (int i = 0; i < boid_count; i++) {
        Boid *boid = &boids[i];
        int chunk_x = boid->pos.x / chunk_size;
        int chunk_y = boid->pos.y / chunk_size;

        // A boid outside the world rectangle would otherwise index outside the
        // grid; put it into the nearest edge chunk instead.
        if (chunk_x < 0) chunk_x = 0;
        if (chunk_x >= chunks.width) chunk_x = chunks.width - 1;
        if (chunk_y < 0) chunk_y = 0;
        if (chunk_y >= chunks.height) chunk_y = chunks.height - 1;

        boid_list_append(arena, chunkgrid_get(&chunks, chunk_x, chunk_y), i);
    }
    RPROF_STOP();

    RPROF_START("world_compute_local_boids()");
    // TODO: Use scalar version for WASM or make 128bit version.
    world_compute_local_boids_simd(all_local_boids, world, &chunks);
    RPROF_STOP();

    return all_local_boids;
}
static void world_update(World *world, float dt) {
if (world->freeze) return;
arena_clear(&world->frame_arena);
Boid *boids = world->boids.data();
int boid_count = world->boids.size();
assert(boid_count <= MAX_BOIDS);
BoidList *list_of_local_boids = world_compute_local_boids(world);
RPROF_START("Apply forces");
for (int i = 0; i < boid_count; i++) {
Boid *boid = &world->boids[i];
Vector2 acc = { 0, 0 };
BoidList *local_boids = &all_local_boids[i];
BoidList *local_boids = &list_of_local_boids[i];
if (local_boids->count > 0) {
Vector2 separation_force = { 0, 0 };