add define to switch between simd 128 and 256

This commit is contained in:
Rokas Puzonas 2023-08-02 21:34:12 +03:00
parent d25d8b5490
commit be2c56fb28
5 changed files with 124 additions and 233 deletions

View File

@ -14,7 +14,7 @@ WEB_HEAP_SIZE := 335544320
WEB_STACK_SIZE := 196608 WEB_STACK_SIZE := 196608
WEB_SHELL := src/shell.html WEB_SHELL := src/shell.html
COMPILER_FLAGS := -std=c++17 -Wno-enum-compare -O3 -g -flto COMPILER_FLAGS := -std=c++17 -Wno-enum-compare -O3 -g -flto -msse4.2 -mavx
COMPILER_FLAGS += -DRPROF_IMPLEMENTATION COMPILER_FLAGS += -DRPROF_IMPLEMENTATION
COMPILER_FLAGS += -DRAYGUI_IMPLEMENTATION COMPILER_FLAGS += -DRAYGUI_IMPLEMENTATION
# COMPILER_FLAGS += -DRLGL_IMPLEMENTATION # COMPILER_FLAGS += -DRLGL_IMPLEMENTATION
@ -75,6 +75,7 @@ ifeq ($(PLATFORM), web)
EMSCRIPTEN_PATH ?= $(EMSDK_PATH)/upstream/emscripten EMSCRIPTEN_PATH ?= $(EMSDK_PATH)/upstream/emscripten
COMPILER_FLAGS += -I$(EMSCRIPTEN_PATH)/cache/sysroot/include COMPILER_FLAGS += -I$(EMSCRIPTEN_PATH)/cache/sysroot/include
COMPILER_FLAGS += -D_DEFAULT_SOURCE COMPILER_FLAGS += -D_DEFAULT_SOURCE
COMPILER_FLAGS += -msimd128
LINKER_FLAGS += -s USE_GLFW=3 LINKER_FLAGS += -s USE_GLFW=3
LINKER_FLAGS += -s FORCE_FILESYSTEM=1 LINKER_FLAGS += -s FORCE_FILESYSTEM=1
LINKER_FLAGS += $(RAYLIB_RELEASE_PATH)/libraylib.a LINKER_FLAGS += $(RAYLIB_RELEASE_PATH)/libraylib.a
@ -83,7 +84,7 @@ ifeq ($(PLATFORM), web)
LINKER_FLAGS += -s STACK_SIZE=$(WEB_STACK_SIZE) LINKER_FLAGS += -s STACK_SIZE=$(WEB_STACK_SIZE)
LIB_DEPENDENCIES += emsdk LIB_DEPENDENCIES += emsdk
else else
COMPILER_FLAGS += -march=native COMPILER_FLAGS += -mavx2 -mfma -DSIMD256
endif endif
LINKER_FLAGS += -L$(RAYLIB_RELEASE_PATH) LINKER_FLAGS += -L$(RAYLIB_RELEASE_PATH)

View File

@ -19,10 +19,6 @@
#include "world.cpp" #include "world.cpp"
#include "ui.cpp" #include "ui.cpp"
//#define USE_TEST_MAIN
// #include "raygui.h"
#define FRAMERATE 60 #define FRAMERATE 60
#define TIME_PER_FRAME (1.0/FRAMERATE) #define TIME_PER_FRAME (1.0/FRAMERATE)
@ -37,8 +33,8 @@ void UpdateDrawFrame();
static void profiling_test(); static void profiling_test();
int main() { int main() {
// profiling_test(); profiling_test();
// return 0; return 0;
SetTraceLogLevel(LOG_TRACE); SetTraceLogLevel(LOG_TRACE);
@ -89,7 +85,7 @@ static void profiling_test() {
SetRandomSeed(10); SetRandomSeed(10);
float border = g_visuals.boid_edge_size; float border = g_visuals.boid_edge_size;
for (int i = 0; i < 45000; i++) { for (int i = 0; i < 50000; i++) {
Boid boid; Boid boid;
boid_rand_init(&g_world, &boid, border); boid_rand_init(&g_world, &boid, border);
g_world.boids.push_back(boid); g_world.boids.push_back(boid);
@ -105,8 +101,9 @@ static void profiling_test() {
rprof_end(); rprof_end();
printf("interactions: %d\n", g_prof_interactions); printf("interactions: %d\n", g_prof_interactions);
if (g_prof_interactions != 33119854) { // 22 051 739 int expected_interactions = 40501984;
printf("!!!!!! ITERACTIONS DONT MATCH, %d\n", g_prof_interactions - 33119854); if (g_prof_interactions != expected_interactions) { // 22 051 739
printf("!!!!!! ITERACTIONS DONT MATCH, %d\n", g_prof_interactions - expected_interactions);
} }
rprof_output(NULL); rprof_output(NULL);

View File

@ -15,9 +15,9 @@
<meta property="og:title" content="Boid playground"> <meta property="og:title" content="Boid playground">
<meta property="og:image:type" content="image/png"> <meta property="og:image:type" content="image/png">
<meta property="og:image" content="https://www.raylib.com/common/img/raylib_logo.png"> <meta property="og:image" content="https://www.raylib.com/common/img/raylib_logo.png">
<meta property="og:site_name" content="raylib.com"> <!-- <meta property="og:site_name" content="rpuzonas.com"> -->
<meta property="og:url" content="https://www.raylib.com/games.html"> <!-- <meta property="og:url" content="https://www.raylib.com/games.html"> -->
<meta property="og:description" content="New raylib web videogame, developed using raylib videogames library"> <meta property="og:description" content="Play around and figure out how boids work">
<!-- Favicon --> <!-- Favicon -->
<link rel="shortcut icon" href="https://www.raylib.com/favicon.ico"> <link rel="shortcut icon" href="https://www.raylib.com/favicon.ico">

46
src/simd.cpp Normal file
View File

@ -0,0 +1,46 @@
#include <stdbool.h>
#include <immintrin.h>
#ifdef SIMD256
#define SIMD_BITS 256
typedef __m256 __simd;
typedef __m256i __simdi;
static inline bool mm_is_zero(__simdi x) {
return _mm256_testz_si256(x, x);
}
#define mm_set1_ps(x) _mm256_set1_ps(x)
#define mm_sub_ps(a, b) _mm256_sub_ps(a, b)
#define mm_mul_ps(a, b) _mm256_mul_ps(a, b)
#define mm_and_ps(a, b) _mm256_and_ps(a, b)
#define mm_store_ps(a, b) _mm256_store_ps(a, b)
#define mm_fmadd_ps(a, b, c) _mm256_fmadd_ps(a, b, c)
#define mm_cmp_ps(a, b, c) _mm256_cmp_ps(a, b, c)
#define mm_rsqrt_ps(a) _mm256_rsqrt_ps(a)
#define mm_blendv_ps(a, b, c) _mm256_blendv_ps(a, b, c)
#define mm_cmpeq_epi32(a, b) _mm256_cmpeq_epi32(a, b)
#else
#define SIMD_BITS 128
typedef __m128 __simd;
typedef __m128i __simdi;
static inline bool mm_is_zero(__simdi x) {
return _mm_test_all_zeros(x, _mm_set1_epi8(0xFF));
}
#define mm_set1_ps(x) _mm_set1_ps(x)
#define mm_sub_ps(a, b) _mm_sub_ps(a, b)
#define mm_mul_ps(a, b) _mm_mul_ps(a, b)
#define mm_and_ps(a, b) _mm_and_ps(a, b)
#define mm_store_ps(a, b) _mm_store_ps(a, b)
#define mm_fmadd_ps(a, b, c) _mm_add_ps(_mm_mul_ps(a, b), c)
#define mm_cmp_ps(a, b, c) _mm_cmp_ps(a, b, c)
#define mm_rsqrt_ps(a) _mm_rsqrt_ps(a)
#define mm_blendv_ps(a, b, c) _mm_blendv_ps(a, b, c)
#define mm_cmpeq_epi32(a, b) _mm_cmpeq_epi32(a, b)
#endif
#define SIMD_32B_LANES (SIMD_BITS/32)
#define mm_is_zero(x) mm_is_zero((__simdi)x)

View File

@ -3,7 +3,7 @@
#include "rprof.h" #include "rprof.h"
#include "boid-list.hpp" #include "boid-list.hpp"
#include <immintrin.h> #include "simd.cpp"
static float vector2_atan2(Vector2 a) { static float vector2_atan2(Vector2 a) {
return std::atan2(a.y, a.x); return std::atan2(a.y, a.x);
@ -175,198 +175,92 @@ static int nearest_multiple(int num, int divisor) {
return (num / divisor + (num % divisor > 0 ? 1 : 0)) * divisor; return (num / divisor + (num % divisor > 0 ? 1 : 0)) * divisor;
} }
// b2l = boid to (list of boids) comparison
static void assign_local_boids_b2l(World *world, BoidList *local_boids, uboid_t from_boid, uboid_t *to_boids, uboid_t to_boids_count) {
Boid *boids = world->boids.data();
// Simplified from: float dot_threshold = Vector2DotProduct(dir, Vector2Rotate(dir, world->view_angle/2));
float dot_threshold = cosf(world->view_angle/2);
float view_radius_sqr = world->view_radius * world->view_radius;
for (int i = 0; i < to_boids_count; i++) {
uint16_t to_boid = to_boids[i];
assert(to_boid != from_boid);
Vector2 offset = Vector2Subtract(boids[from_boid].pos, boids[to_boid].pos);
float length_sqr = Vector2LengthSqr(offset);
if (length_sqr > view_radius_sqr) continue;
Vector2 normalized = offset;
if (length_sqr != 0)
{
float ilength = 1.0f/sqrtf(length_sqr);
normalized.x *= ilength;
normalized.y *= ilength;
}
if (Vector2DotProduct(boids[from_boid].dir, Vector2Negate(normalized)) >= dot_threshold) {
boid_list_append(&world->frame_arena, &local_boids[from_boid], to_boid);
g_prof_interactions++;
}
if (Vector2DotProduct(boids[to_boid].dir, normalized) >= dot_threshold) {
boid_list_append(&world->frame_arena, &local_boids[to_boid], from_boid);
g_prof_interactions++;
}
}
}
#ifndef __EMSCRIPTEN__
static void print_m256_f32(__m256 value) {
float *value_f32 = (float*)&value;
printf("%f", value_f32[0]);
for (int i = 1; i < 8; i++) {
printf(",%f", value_f32[i]);
}
printf("\n");
}
static inline bool mm256_is_zero(__m256i x) {
return _mm256_testz_si256(x, x);
}
static inline bool mm256_is_zero(__m256 x) {
return mm256_is_zero((__m256i)x);
}
struct boid_pair { struct boid_pair {
uboid_t from; uboid_t from;
uboid_t to; uboid_t to;
}; };
static void world_process_local_boid_pairs(World *world, BoidList *local_boids, boid_pair *b2b_cmps, int *b2b_cmps_count) { #ifdef SIMD256
#define GET_F32_CHUNK_FROM_BOIDS(i, SIDE, FIELD) \
_mm256_set_ps( \
boids[b2b_cmps[8*i+7].SIDE].FIELD, \
boids[b2b_cmps[8*i+6].SIDE].FIELD, \
boids[b2b_cmps[8*i+5].SIDE].FIELD, \
boids[b2b_cmps[8*i+4].SIDE].FIELD, \
boids[b2b_cmps[8*i+3].SIDE].FIELD, \
boids[b2b_cmps[8*i+2].SIDE].FIELD, \
boids[b2b_cmps[8*i+1].SIDE].FIELD, \
boids[b2b_cmps[8*i+0].SIDE].FIELD \
)
#else
#define GET_F32_CHUNK_FROM_BOIDS(i, SIDE, FIELD) \
_mm_set_ps( \
boids[b2b_cmps[4*i+3].SIDE].FIELD, \
boids[b2b_cmps[4*i+2].SIDE].FIELD, \
boids[b2b_cmps[4*i+1].SIDE].FIELD, \
boids[b2b_cmps[4*i+0].SIDE].FIELD \
)
#endif
static void world_calc_distances_and_angles(World *world, BoidList *local_boids, boid_pair *b2b_cmps, int *b2b_cmps_count) {
RPROF_START("Calc dot products and distances");
// Simplified from: // Simplified from:
// float dot_threshold = Vector2DotProduct(dir, Vector2Rotate(dir, world->view_angle/2)); // float dot_threshold = Vector2DotProduct(dir, Vector2Rotate(dir, world->view_angle/2));
// | // |
// v // v
// float dot_threshold = cosf(world->view_angle/2); // float dot_threshold = cosf(world->view_angle/2);
__m256 dot_threshold = _mm256_set1_ps(cosf(world->view_angle/2)); __simd dot_threshold = mm_set1_ps(cosf(world->view_angle/2));
float view_radius_sqr = world->view_radius * world->view_radius; float view_radius_sqr = world->view_radius * world->view_radius;
__m256 view_radius = _mm256_set1_ps(view_radius_sqr); __simd view_radius = mm_set1_ps(view_radius_sqr);
__m256 zero = _mm256_set1_ps(0); __simd zero = mm_set1_ps(0);
__m256 negative_one = _mm256_set1_ps(-1); __simd negative_one = mm_set1_ps(-1);
Boid *boids = world->boids.data(); Boid *boids = world->boids.data();
int32_t *do_append_mask1_f32 = (int32_t*)arena_malloc(&world->frame_arena, sizeof(int32_t)*8, 32); int32_t *do_append_mask1_f32 = (int32_t*)arena_malloc(&world->frame_arena, sizeof(int32_t)*SIMD_32B_LANES, 32);
int32_t *do_append_mask2_f32 = (int32_t*)arena_malloc(&world->frame_arena, sizeof(int32_t)*8, 32); int32_t *do_append_mask2_f32 = (int32_t*)arena_malloc(&world->frame_arena, sizeof(int32_t)*SIMD_32B_LANES, 32);
int simd_iteration_count = nearest_multiple(*b2b_cmps_count, 8)/8; int simd_iteration_count = nearest_multiple(*b2b_cmps_count, SIMD_32B_LANES)/SIMD_32B_LANES;
for (int i = 0; i < simd_iteration_count; i++) { for (int i = 0; i < simd_iteration_count; i++) {
__m256 from_pos_x = _mm256_set_ps( __simd from_pos_x = GET_F32_CHUNK_FROM_BOIDS(i, from, pos.x);
boids[b2b_cmps[8*i+7].from].pos.x, __simd from_pos_y = GET_F32_CHUNK_FROM_BOIDS(i, from, pos.y);
boids[b2b_cmps[8*i+6].from].pos.x,
boids[b2b_cmps[8*i+5].from].pos.x,
boids[b2b_cmps[8*i+4].from].pos.x,
boids[b2b_cmps[8*i+3].from].pos.x,
boids[b2b_cmps[8*i+2].from].pos.x,
boids[b2b_cmps[8*i+1].from].pos.x,
boids[b2b_cmps[8*i+0].from].pos.x
);
__m256 from_pos_y = _mm256_set_ps( __simd to_pos_x = GET_F32_CHUNK_FROM_BOIDS(i, to, pos.x);
boids[b2b_cmps[8*i+7].from].pos.y, __simd to_pos_y = GET_F32_CHUNK_FROM_BOIDS(i, to, pos.y);
boids[b2b_cmps[8*i+6].from].pos.y,
boids[b2b_cmps[8*i+5].from].pos.y,
boids[b2b_cmps[8*i+4].from].pos.y,
boids[b2b_cmps[8*i+3].from].pos.y,
boids[b2b_cmps[8*i+2].from].pos.y,
boids[b2b_cmps[8*i+1].from].pos.y,
boids[b2b_cmps[8*i+0].from].pos.y
);
__m256 to_pos_x = _mm256_set_ps( __simd sub_x = mm_sub_ps(from_pos_x, to_pos_x);
boids[b2b_cmps[8*i+7].to].pos.x, __simd sub_y = mm_sub_ps(from_pos_y, to_pos_y);
boids[b2b_cmps[8*i+6].to].pos.x,
boids[b2b_cmps[8*i+5].to].pos.x,
boids[b2b_cmps[8*i+4].to].pos.x,
boids[b2b_cmps[8*i+3].to].pos.x,
boids[b2b_cmps[8*i+2].to].pos.x,
boids[b2b_cmps[8*i+1].to].pos.x,
boids[b2b_cmps[8*i+0].to].pos.x
);
__m256 to_pos_y = _mm256_set_ps( __simd length_sqr = mm_fmadd_ps(sub_y, sub_y, mm_mul_ps(sub_x, sub_x));
boids[b2b_cmps[8*i+7].to].pos.y, __simdi in_range_mask = (__simdi)mm_cmp_ps(length_sqr, view_radius, _CMP_LE_OQ);
boids[b2b_cmps[8*i+6].to].pos.y, if (mm_is_zero(in_range_mask)) continue;
boids[b2b_cmps[8*i+5].to].pos.y,
boids[b2b_cmps[8*i+4].to].pos.y,
boids[b2b_cmps[8*i+3].to].pos.y,
boids[b2b_cmps[8*i+2].to].pos.y,
boids[b2b_cmps[8*i+1].to].pos.y,
boids[b2b_cmps[8*i+0].to].pos.y
);
__m256 sub_x = _mm256_sub_ps(from_pos_x, to_pos_x); __simd from_dir_x = GET_F32_CHUNK_FROM_BOIDS(i, from, dir.x);
__m256 sub_y = _mm256_sub_ps(from_pos_y, to_pos_y); __simd from_dir_y = GET_F32_CHUNK_FROM_BOIDS(i, from, dir.y);
__m256 length_sqr = _mm256_fmadd_ps(sub_y, sub_y, _mm256_mul_ps(sub_x, sub_x)); __simd to_dir_x = GET_F32_CHUNK_FROM_BOIDS(i, to, dir.x);
__m256i in_range_mask = (__m256i)_mm256_cmp_ps(length_sqr, view_radius, _CMP_LE_OQ); __simd to_dir_y = GET_F32_CHUNK_FROM_BOIDS(i, to, dir.y);
if (mm256_is_zero(in_range_mask)) continue;
__m256 from_dir_x = _mm256_set_ps( __simd is_length_zero = (__simd)mm_cmpeq_epi32((__simdi)length_sqr, (__simdi)zero);
boids[b2b_cmps[8*i+7].from].dir.x, __simd ilength = mm_blendv_ps(mm_rsqrt_ps(length_sqr), zero, is_length_zero);
boids[b2b_cmps[8*i+6].from].dir.x,
boids[b2b_cmps[8*i+5].from].dir.x,
boids[b2b_cmps[8*i+4].from].dir.x,
boids[b2b_cmps[8*i+3].from].dir.x,
boids[b2b_cmps[8*i+2].from].dir.x,
boids[b2b_cmps[8*i+1].from].dir.x,
boids[b2b_cmps[8*i+0].from].dir.x
);
__m256 from_dir_y = _mm256_set_ps( __simd x_norm = mm_mul_ps(sub_x, ilength);
boids[b2b_cmps[8*i+7].from].dir.y, __simd y_norm = mm_mul_ps(sub_y, ilength);
boids[b2b_cmps[8*i+6].from].dir.y,
boids[b2b_cmps[8*i+5].from].dir.y,
boids[b2b_cmps[8*i+4].from].dir.y,
boids[b2b_cmps[8*i+3].from].dir.y,
boids[b2b_cmps[8*i+2].from].dir.y,
boids[b2b_cmps[8*i+1].from].dir.y,
boids[b2b_cmps[8*i+0].from].dir.y
);
__m256 to_dir_x = _mm256_set_ps( __simd x_neg_norm = mm_mul_ps(x_norm, negative_one);
boids[b2b_cmps[8*i+7].to].dir.x, __simd y_neg_norm = mm_mul_ps(y_norm, negative_one);
boids[b2b_cmps[8*i+6].to].dir.x,
boids[b2b_cmps[8*i+5].to].dir.x,
boids[b2b_cmps[8*i+4].to].dir.x,
boids[b2b_cmps[8*i+3].to].dir.x,
boids[b2b_cmps[8*i+2].to].dir.x,
boids[b2b_cmps[8*i+1].to].dir.x,
boids[b2b_cmps[8*i+0].to].dir.x
);
__m256 to_dir_y = _mm256_set_ps( __simd dot_product1 = mm_fmadd_ps(from_dir_y, y_neg_norm, mm_mul_ps(from_dir_x, x_neg_norm));
boids[b2b_cmps[8*i+7].to].dir.y, __simd in_angle_mask1 = mm_cmp_ps(dot_product1, dot_threshold, _CMP_GE_OQ);
boids[b2b_cmps[8*i+6].to].dir.y, __simd do_append_mask1 = mm_and_ps(in_angle_mask1, (__simd)in_range_mask);
boids[b2b_cmps[8*i+5].to].dir.y,
boids[b2b_cmps[8*i+4].to].dir.y,
boids[b2b_cmps[8*i+3].to].dir.y,
boids[b2b_cmps[8*i+2].to].dir.y,
boids[b2b_cmps[8*i+1].to].dir.y,
boids[b2b_cmps[8*i+0].to].dir.y
);
__m256 is_length_zero = (__m256)_mm256_cmpeq_epi32((__m256i)length_sqr, (__m256i)zero); __simd dot_product2 = mm_fmadd_ps(to_dir_y, y_norm, mm_mul_ps(to_dir_x, x_norm));
__m256 ilength = _mm256_blendv_ps(_mm256_rsqrt_ps(length_sqr), zero, is_length_zero); __simd in_angle_mask2 = mm_cmp_ps(dot_product2, dot_threshold, _CMP_GE_OQ);
__simd do_append_mask2 = mm_and_ps(in_angle_mask2, (__simd)in_range_mask);
__m256 x_norm = _mm256_mul_ps(sub_x, ilength); mm_store_ps((float*)do_append_mask1_f32, do_append_mask1);
__m256 y_norm = _mm256_mul_ps(sub_y, ilength); mm_store_ps((float*)do_append_mask2_f32, do_append_mask2);
for (int j = 0; j < SIMD_32B_LANES; j++) {
__m256 x_neg_norm = _mm256_mul_ps(x_norm, negative_one); uboid_t cmp_idx = SIMD_32B_LANES*i + j;
__m256 y_neg_norm = _mm256_mul_ps(y_norm, negative_one);
__m256 dot_product1 = _mm256_fmadd_ps(from_dir_y, y_neg_norm, _mm256_mul_ps(from_dir_x, x_neg_norm));
__m256 in_angle_mask1 = _mm256_cmp_ps(dot_product1, dot_threshold, _CMP_GE_OQ);
__m256 do_append_mask1 = _mm256_and_ps(in_angle_mask1, (__m256)in_range_mask);
__m256 dot_product2 = _mm256_fmadd_ps(to_dir_y, y_norm, _mm256_mul_ps(to_dir_x, x_norm));
__m256 in_angle_mask2 = _mm256_cmp_ps(dot_product2, dot_threshold, _CMP_GE_OQ);
__m256 do_append_mask2 = _mm256_and_ps(in_angle_mask2, (__m256)in_range_mask);
_mm256_store_ps((float*)do_append_mask1_f32, do_append_mask1);
_mm256_store_ps((float*)do_append_mask2_f32, do_append_mask2);
for (int j = 0; j < 8; j++) {
uboid_t cmp_idx = 8*i + j;
if (cmp_idx >= *b2b_cmps_count) break; if (cmp_idx >= *b2b_cmps_count) break;
uboid_t from_boid = b2b_cmps[cmp_idx].from; uboid_t from_boid = b2b_cmps[cmp_idx].from;
@ -383,9 +277,10 @@ static void world_process_local_boid_pairs(World *world, BoidList *local_boids,
} }
*b2b_cmps_count = 0; *b2b_cmps_count = 0;
RPROF_STOP();
} }
static void world_compute_local_boids_simd(BoidList *local_boids, World *world, ChunkGrid *chunks) { static void world_compute_local_boids(BoidList *local_boids, World *world, ChunkGrid *chunks) {
Boid *boids = world->boids.data(); Boid *boids = world->boids.data();
int boid_count = world->boids.size(); int boid_count = world->boids.size();
MemoryArena *arena = &world->frame_arena; MemoryArena *arena = &world->frame_arena;
@ -410,7 +305,7 @@ static void world_compute_local_boids_simd(BoidList *local_boids, World *world,
} }
RPROF_STOP(); RPROF_STOP();
RPROF_START("Calc dot products and ranges (simd)"); RPROF_START("Iterate over chunks");
for (int y = 0; y < chunks->height; y++) { for (int y = 0; y < chunks->height; y++) {
for (int x = 0; x < chunks->width; x++) { for (int x = 0; x < chunks->width; x++) {
size_t chunk_idx = chunkgrid_get_idx(chunks, x, y); size_t chunk_idx = chunkgrid_get_idx(chunks, x, y);
@ -458,55 +353,13 @@ static void world_compute_local_boids_simd(BoidList *local_boids, World *world,
} }
if (b2b_cmps_count > 2048*3) { if (b2b_cmps_count > 2048*3) {
world_process_local_boid_pairs(world, local_boids, b2b_cmps, &b2b_cmps_count); world_calc_distances_and_angles(world, local_boids, b2b_cmps, &b2b_cmps_count);
} }
} }
} }
if (b2b_cmps_count > 0) { if (b2b_cmps_count > 0) {
world_process_local_boid_pairs(world, local_boids, b2b_cmps, &b2b_cmps_count); world_calc_distances_and_angles(world, local_boids, b2b_cmps, &b2b_cmps_count);
}
RPROF_STOP();
}
#endif
static void world_compute_local_boids_scalar(BoidList *local_boids, World *world, ChunkGrid *chunks) {
RPROF_START("Calc dot products and ranges (scalar)");
for (int y = 0; y < chunks->height; y++) {
for (int x = 0; x < chunks->width; x++) {
BoidList *chunk = chunkgrid_get(chunks, x, y);
if (chunk->count == 0) continue;
uboid_t chunk_boids[chunk->count];
boid_list_to_array(chunk_boids, chunk);
for (int i = 0; i < chunk->count-1; i++) {
uboid_t from_boid = chunk_boids[i];
uboid_t *to_boids = &chunk_boids[i+1];
uboid_t to_boids_count = chunk->count-i-1;
assign_local_boids_b2l(world, local_boids, from_boid, to_boids, to_boids_count);
}
Vector2 neighbours[] = { { 1, 0 }, { 0, 1 }, { 1, 1 }, { -1, 1 } };
for (int i = 0; i < ARRAY_LEN(neighbours); i++) {
int chunk_y = y + neighbours[i].y;
int chunk_x = x + neighbours[i].x;
if (chunk_y < 0 || chunk_y >= chunks->height) continue;
if (chunk_x < 0 || chunk_x >= chunks->width) continue;
BoidList *neighbour_chunk = chunkgrid_get(chunks, chunk_x, chunk_y);
if (neighbour_chunk->count == 0) continue;
uboid_t neighbour_ids[neighbour_chunk->count];
boid_list_to_array(neighbour_ids, neighbour_chunk);
uboid_t boid1;
BoidsListNodeIterator it1 = boid_list_get_iterator(chunk);
while (boid_list_iterator_next(&it1, &boid1)) {
assign_local_boids_b2l(world, local_boids, boid1, neighbour_ids, neighbour_chunk->count);
}
}
}
} }
RPROF_STOP(); RPROF_STOP();
} }
@ -543,13 +396,7 @@ static BoidList* world_compute_local_boids(World *world) {
RPROF_STOP(); RPROF_STOP();
RPROF_START("world_compute_local_boids()"); RPROF_START("world_compute_local_boids()");
#ifdef __EMSCRIPTEN__ world_compute_local_boids(all_local_boids, world, &chunks);
// TODO: Rewrite simd version to only use SSE, not AVX2
world_compute_local_boids_scalar(all_local_boids, world, &chunks);
#else
world_compute_local_boids_simd(all_local_boids, world, &chunks);
#endif
RPROF_STOP(); RPROF_STOP();
return all_local_boids; return all_local_boids;