generated from rpuzonas/raylib-cpp-template
Add a define (SIMD256) to switch between 128-bit and 256-bit SIMD
This commit is contained in:
parent
d25d8b5490
commit
be2c56fb28
5
Makefile
5
Makefile
@ -14,7 +14,7 @@ WEB_HEAP_SIZE := 335544320
|
|||||||
WEB_STACK_SIZE := 196608
|
WEB_STACK_SIZE := 196608
|
||||||
WEB_SHELL := src/shell.html
|
WEB_SHELL := src/shell.html
|
||||||
|
|
||||||
COMPILER_FLAGS := -std=c++17 -Wno-enum-compare -O3 -g -flto
|
COMPILER_FLAGS := -std=c++17 -Wno-enum-compare -O3 -g -flto -msse4.2 -mavx
|
||||||
COMPILER_FLAGS += -DRPROF_IMPLEMENTATION
|
COMPILER_FLAGS += -DRPROF_IMPLEMENTATION
|
||||||
COMPILER_FLAGS += -DRAYGUI_IMPLEMENTATION
|
COMPILER_FLAGS += -DRAYGUI_IMPLEMENTATION
|
||||||
# COMPILER_FLAGS += -DRLGL_IMPLEMENTATION
|
# COMPILER_FLAGS += -DRLGL_IMPLEMENTATION
|
||||||
@ -75,6 +75,7 @@ ifeq ($(PLATFORM), web)
|
|||||||
EMSCRIPTEN_PATH ?= $(EMSDK_PATH)/upstream/emscripten
|
EMSCRIPTEN_PATH ?= $(EMSDK_PATH)/upstream/emscripten
|
||||||
COMPILER_FLAGS += -I$(EMSCRIPTEN_PATH)/cache/sysroot/include
|
COMPILER_FLAGS += -I$(EMSCRIPTEN_PATH)/cache/sysroot/include
|
||||||
COMPILER_FLAGS += -D_DEFAULT_SOURCE
|
COMPILER_FLAGS += -D_DEFAULT_SOURCE
|
||||||
|
COMPILER_FLAGS += -msimd128
|
||||||
LINKER_FLAGS += -s USE_GLFW=3
|
LINKER_FLAGS += -s USE_GLFW=3
|
||||||
LINKER_FLAGS += -s FORCE_FILESYSTEM=1
|
LINKER_FLAGS += -s FORCE_FILESYSTEM=1
|
||||||
LINKER_FLAGS += $(RAYLIB_RELEASE_PATH)/libraylib.a
|
LINKER_FLAGS += $(RAYLIB_RELEASE_PATH)/libraylib.a
|
||||||
@ -83,7 +84,7 @@ ifeq ($(PLATFORM), web)
|
|||||||
LINKER_FLAGS += -s STACK_SIZE=$(WEB_STACK_SIZE)
|
LINKER_FLAGS += -s STACK_SIZE=$(WEB_STACK_SIZE)
|
||||||
LIB_DEPENDENCIES += emsdk
|
LIB_DEPENDENCIES += emsdk
|
||||||
else
|
else
|
||||||
COMPILER_FLAGS += -march=native
|
COMPILER_FLAGS += -mavx2 -mfma -DSIMD256
|
||||||
endif
|
endif
|
||||||
|
|
||||||
LINKER_FLAGS += -L$(RAYLIB_RELEASE_PATH)
|
LINKER_FLAGS += -L$(RAYLIB_RELEASE_PATH)
|
||||||
|
15
src/main.cpp
15
src/main.cpp
@ -19,10 +19,6 @@
|
|||||||
#include "world.cpp"
|
#include "world.cpp"
|
||||||
#include "ui.cpp"
|
#include "ui.cpp"
|
||||||
|
|
||||||
//#define USE_TEST_MAIN
|
|
||||||
|
|
||||||
// #include "raygui.h"
|
|
||||||
|
|
||||||
#define FRAMERATE 60
|
#define FRAMERATE 60
|
||||||
#define TIME_PER_FRAME (1.0/FRAMERATE)
|
#define TIME_PER_FRAME (1.0/FRAMERATE)
|
||||||
|
|
||||||
@ -37,8 +33,8 @@ void UpdateDrawFrame();
|
|||||||
static void profiling_test();
|
static void profiling_test();
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
// profiling_test();
|
profiling_test();
|
||||||
// return 0;
|
return 0;
|
||||||
|
|
||||||
SetTraceLogLevel(LOG_TRACE);
|
SetTraceLogLevel(LOG_TRACE);
|
||||||
|
|
||||||
@ -89,7 +85,7 @@ static void profiling_test() {
|
|||||||
SetRandomSeed(10);
|
SetRandomSeed(10);
|
||||||
|
|
||||||
float border = g_visuals.boid_edge_size;
|
float border = g_visuals.boid_edge_size;
|
||||||
for (int i = 0; i < 45000; i++) {
|
for (int i = 0; i < 50000; i++) {
|
||||||
Boid boid;
|
Boid boid;
|
||||||
boid_rand_init(&g_world, &boid, border);
|
boid_rand_init(&g_world, &boid, border);
|
||||||
g_world.boids.push_back(boid);
|
g_world.boids.push_back(boid);
|
||||||
@ -105,8 +101,9 @@ static void profiling_test() {
|
|||||||
rprof_end();
|
rprof_end();
|
||||||
|
|
||||||
printf("interactions: %d\n", g_prof_interactions);
|
printf("interactions: %d\n", g_prof_interactions);
|
||||||
if (g_prof_interactions != 33119854) { // 22 051 739
|
int expected_interactions = 40501984;
|
||||||
printf("!!!!!! ITERACTIONS DONT MATCH, %d\n", g_prof_interactions - 33119854);
|
if (g_prof_interactions != expected_interactions) { // 22 051 739
|
||||||
|
printf("!!!!!! ITERACTIONS DONT MATCH, %d\n", g_prof_interactions - expected_interactions);
|
||||||
}
|
}
|
||||||
|
|
||||||
rprof_output(NULL);
|
rprof_output(NULL);
|
||||||
|
@ -15,9 +15,9 @@
|
|||||||
<meta property="og:title" content="Boid playground">
|
<meta property="og:title" content="Boid playground">
|
||||||
<meta property="og:image:type" content="image/png">
|
<meta property="og:image:type" content="image/png">
|
||||||
<meta property="og:image" content="https://www.raylib.com/common/img/raylib_logo.png">
|
<meta property="og:image" content="https://www.raylib.com/common/img/raylib_logo.png">
|
||||||
<meta property="og:site_name" content="raylib.com">
|
<!-- <meta property="og:site_name" content="rpuzonas.com"> -->
|
||||||
<meta property="og:url" content="https://www.raylib.com/games.html">
|
<!-- <meta property="og:url" content="https://www.raylib.com/games.html"> -->
|
||||||
<meta property="og:description" content="New raylib web videogame, developed using raylib videogames library">
|
<meta property="og:description" content="Play around and figure out how boids work">
|
||||||
|
|
||||||
<!-- Favicon -->
|
<!-- Favicon -->
|
||||||
<link rel="shortcut icon" href="https://www.raylib.com/favicon.ico">
|
<link rel="shortcut icon" href="https://www.raylib.com/favicon.ico">
|
||||||
|
46
src/simd.cpp
Normal file
46
src/simd.cpp
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
#include <stdbool.h>
|
||||||
|
#include <immintrin.h>
|
||||||
|
|
||||||
|
#ifdef SIMD256
|
||||||
|
#define SIMD_BITS 256
|
||||||
|
typedef __m256 __simd;
|
||||||
|
typedef __m256i __simdi;
|
||||||
|
|
||||||
|
static inline bool mm_is_zero(__simdi x) {
|
||||||
|
return _mm256_testz_si256(x, x);
|
||||||
|
}
|
||||||
|
|
||||||
|
#define mm_set1_ps(x) _mm256_set1_ps(x)
|
||||||
|
#define mm_sub_ps(a, b) _mm256_sub_ps(a, b)
|
||||||
|
#define mm_mul_ps(a, b) _mm256_mul_ps(a, b)
|
||||||
|
#define mm_and_ps(a, b) _mm256_and_ps(a, b)
|
||||||
|
#define mm_store_ps(a, b) _mm256_store_ps(a, b)
|
||||||
|
#define mm_fmadd_ps(a, b, c) _mm256_fmadd_ps(a, b, c)
|
||||||
|
#define mm_cmp_ps(a, b, c) _mm256_cmp_ps(a, b, c)
|
||||||
|
#define mm_rsqrt_ps(a) _mm256_rsqrt_ps(a)
|
||||||
|
#define mm_blendv_ps(a, b, c) _mm256_blendv_ps(a, b, c)
|
||||||
|
#define mm_cmpeq_epi32(a, b) _mm256_cmpeq_epi32(a, b)
|
||||||
|
#else
|
||||||
|
#define SIMD_BITS 128
|
||||||
|
typedef __m128 __simd;
|
||||||
|
typedef __m128i __simdi;
|
||||||
|
|
||||||
|
static inline bool mm_is_zero(__simdi x) {
|
||||||
|
return _mm_test_all_zeros(x, _mm_set1_epi8(0xFF));
|
||||||
|
}
|
||||||
|
|
||||||
|
#define mm_set1_ps(x) _mm_set1_ps(x)
|
||||||
|
#define mm_sub_ps(a, b) _mm_sub_ps(a, b)
|
||||||
|
#define mm_mul_ps(a, b) _mm_mul_ps(a, b)
|
||||||
|
#define mm_and_ps(a, b) _mm_and_ps(a, b)
|
||||||
|
#define mm_store_ps(a, b) _mm_store_ps(a, b)
|
||||||
|
#define mm_fmadd_ps(a, b, c) _mm_add_ps(_mm_mul_ps(a, b), c)
|
||||||
|
#define mm_cmp_ps(a, b, c) _mm_cmp_ps(a, b, c)
|
||||||
|
#define mm_rsqrt_ps(a) _mm_rsqrt_ps(a)
|
||||||
|
#define mm_blendv_ps(a, b, c) _mm_blendv_ps(a, b, c)
|
||||||
|
#define mm_cmpeq_epi32(a, b) _mm_cmpeq_epi32(a, b)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define SIMD_32B_LANES (SIMD_BITS/32)
|
||||||
|
|
||||||
|
// Wrapper around the mm_is_zero() function of the same name: casts the
// argument to __simdi before the call. A function-like macro is not expanded
// again inside its own expansion, so the inner name refers to the function.
// The argument is parenthesized so the cast applies to the whole expression
// passed in, not just to its first operand.
#define mm_is_zero(x) mm_is_zero((__simdi)(x))
|
285
src/world.cpp
285
src/world.cpp
@ -3,7 +3,7 @@
|
|||||||
#include "rprof.h"
|
#include "rprof.h"
|
||||||
#include "boid-list.hpp"
|
#include "boid-list.hpp"
|
||||||
|
|
||||||
#include <immintrin.h>
|
#include "simd.cpp"
|
||||||
|
|
||||||
static float vector2_atan2(Vector2 a) {
|
static float vector2_atan2(Vector2 a) {
|
||||||
return std::atan2(a.y, a.x);
|
return std::atan2(a.y, a.x);
|
||||||
@ -175,198 +175,92 @@ static int nearest_multiple(int num, int divisor) {
|
|||||||
return (num / divisor + (num % divisor > 0 ? 1 : 0)) * divisor;
|
return (num / divisor + (num % divisor > 0 ? 1 : 0)) * divisor;
|
||||||
}
|
}
|
||||||
|
|
||||||
// b2l = boid to (list of boids) comparison
|
|
||||||
static void assign_local_boids_b2l(World *world, BoidList *local_boids, uboid_t from_boid, uboid_t *to_boids, uboid_t to_boids_count) {
|
|
||||||
Boid *boids = world->boids.data();
|
|
||||||
|
|
||||||
// Simplified from: float dot_threshold = Vector2DotProduct(dir, Vector2Rotate(dir, world->view_angle/2));
|
|
||||||
float dot_threshold = cosf(world->view_angle/2);
|
|
||||||
float view_radius_sqr = world->view_radius * world->view_radius;
|
|
||||||
|
|
||||||
for (int i = 0; i < to_boids_count; i++) {
|
|
||||||
uint16_t to_boid = to_boids[i];
|
|
||||||
assert(to_boid != from_boid);
|
|
||||||
|
|
||||||
Vector2 offset = Vector2Subtract(boids[from_boid].pos, boids[to_boid].pos);
|
|
||||||
float length_sqr = Vector2LengthSqr(offset);
|
|
||||||
if (length_sqr > view_radius_sqr) continue;
|
|
||||||
|
|
||||||
Vector2 normalized = offset;
|
|
||||||
if (length_sqr != 0)
|
|
||||||
{
|
|
||||||
float ilength = 1.0f/sqrtf(length_sqr);
|
|
||||||
normalized.x *= ilength;
|
|
||||||
normalized.y *= ilength;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (Vector2DotProduct(boids[from_boid].dir, Vector2Negate(normalized)) >= dot_threshold) {
|
|
||||||
boid_list_append(&world->frame_arena, &local_boids[from_boid], to_boid);
|
|
||||||
g_prof_interactions++;
|
|
||||||
}
|
|
||||||
if (Vector2DotProduct(boids[to_boid].dir, normalized) >= dot_threshold) {
|
|
||||||
boid_list_append(&world->frame_arena, &local_boids[to_boid], from_boid);
|
|
||||||
g_prof_interactions++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifndef __EMSCRIPTEN__
|
|
||||||
static void print_m256_f32(__m256 value) {
|
|
||||||
float *value_f32 = (float*)&value;
|
|
||||||
printf("%f", value_f32[0]);
|
|
||||||
for (int i = 1; i < 8; i++) {
|
|
||||||
printf(",%f", value_f32[i]);
|
|
||||||
}
|
|
||||||
printf("\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline bool mm256_is_zero(__m256i x) {
|
|
||||||
return _mm256_testz_si256(x, x);
|
|
||||||
}
|
|
||||||
static inline bool mm256_is_zero(__m256 x) {
|
|
||||||
return mm256_is_zero((__m256i)x);
|
|
||||||
}
|
|
||||||
|
|
||||||
struct boid_pair {
|
struct boid_pair {
|
||||||
uboid_t from;
|
uboid_t from;
|
||||||
uboid_t to;
|
uboid_t to;
|
||||||
};
|
};
|
||||||
|
|
||||||
static void world_process_local_boid_pairs(World *world, BoidList *local_boids, boid_pair *b2b_cmps, int *b2b_cmps_count) {
|
#ifdef SIMD256
|
||||||
|
#define GET_F32_CHUNK_FROM_BOIDS(i, SIDE, FIELD) \
|
||||||
|
_mm256_set_ps( \
|
||||||
|
boids[b2b_cmps[8*i+7].SIDE].FIELD, \
|
||||||
|
boids[b2b_cmps[8*i+6].SIDE].FIELD, \
|
||||||
|
boids[b2b_cmps[8*i+5].SIDE].FIELD, \
|
||||||
|
boids[b2b_cmps[8*i+4].SIDE].FIELD, \
|
||||||
|
boids[b2b_cmps[8*i+3].SIDE].FIELD, \
|
||||||
|
boids[b2b_cmps[8*i+2].SIDE].FIELD, \
|
||||||
|
boids[b2b_cmps[8*i+1].SIDE].FIELD, \
|
||||||
|
boids[b2b_cmps[8*i+0].SIDE].FIELD \
|
||||||
|
)
|
||||||
|
#else
|
||||||
|
#define GET_F32_CHUNK_FROM_BOIDS(i, SIDE, FIELD) \
|
||||||
|
_mm_set_ps( \
|
||||||
|
boids[b2b_cmps[4*i+3].SIDE].FIELD, \
|
||||||
|
boids[b2b_cmps[4*i+2].SIDE].FIELD, \
|
||||||
|
boids[b2b_cmps[4*i+1].SIDE].FIELD, \
|
||||||
|
boids[b2b_cmps[4*i+0].SIDE].FIELD \
|
||||||
|
)
|
||||||
|
#endif
|
||||||
|
static void world_calc_distances_and_angles(World *world, BoidList *local_boids, boid_pair *b2b_cmps, int *b2b_cmps_count) {
|
||||||
|
RPROF_START("Calc dot products and distances");
|
||||||
// Simplified from:
|
// Simplified from:
|
||||||
// float dot_threshold = Vector2DotProduct(dir, Vector2Rotate(dir, world->view_angle/2));
|
// float dot_threshold = Vector2DotProduct(dir, Vector2Rotate(dir, world->view_angle/2));
|
||||||
// |
|
// |
|
||||||
// v
|
// v
|
||||||
// float dot_threshold = cosf(world->view_angle/2);
|
// float dot_threshold = cosf(world->view_angle/2);
|
||||||
__m256 dot_threshold = _mm256_set1_ps(cosf(world->view_angle/2));
|
__simd dot_threshold = mm_set1_ps(cosf(world->view_angle/2));
|
||||||
|
|
||||||
float view_radius_sqr = world->view_radius * world->view_radius;
|
float view_radius_sqr = world->view_radius * world->view_radius;
|
||||||
__m256 view_radius = _mm256_set1_ps(view_radius_sqr);
|
__simd view_radius = mm_set1_ps(view_radius_sqr);
|
||||||
__m256 zero = _mm256_set1_ps(0);
|
__simd zero = mm_set1_ps(0);
|
||||||
__m256 negative_one = _mm256_set1_ps(-1);
|
__simd negative_one = mm_set1_ps(-1);
|
||||||
Boid *boids = world->boids.data();
|
Boid *boids = world->boids.data();
|
||||||
|
|
||||||
int32_t *do_append_mask1_f32 = (int32_t*)arena_malloc(&world->frame_arena, sizeof(int32_t)*8, 32);
|
int32_t *do_append_mask1_f32 = (int32_t*)arena_malloc(&world->frame_arena, sizeof(int32_t)*SIMD_32B_LANES, 32);
|
||||||
int32_t *do_append_mask2_f32 = (int32_t*)arena_malloc(&world->frame_arena, sizeof(int32_t)*8, 32);
|
int32_t *do_append_mask2_f32 = (int32_t*)arena_malloc(&world->frame_arena, sizeof(int32_t)*SIMD_32B_LANES, 32);
|
||||||
|
|
||||||
int simd_iteration_count = nearest_multiple(*b2b_cmps_count, 8)/8;
|
int simd_iteration_count = nearest_multiple(*b2b_cmps_count, SIMD_32B_LANES)/SIMD_32B_LANES;
|
||||||
for (int i = 0; i < simd_iteration_count; i++) {
|
for (int i = 0; i < simd_iteration_count; i++) {
|
||||||
__m256 from_pos_x = _mm256_set_ps(
|
__simd from_pos_x = GET_F32_CHUNK_FROM_BOIDS(i, from, pos.x);
|
||||||
boids[b2b_cmps[8*i+7].from].pos.x,
|
__simd from_pos_y = GET_F32_CHUNK_FROM_BOIDS(i, from, pos.y);
|
||||||
boids[b2b_cmps[8*i+6].from].pos.x,
|
|
||||||
boids[b2b_cmps[8*i+5].from].pos.x,
|
|
||||||
boids[b2b_cmps[8*i+4].from].pos.x,
|
|
||||||
boids[b2b_cmps[8*i+3].from].pos.x,
|
|
||||||
boids[b2b_cmps[8*i+2].from].pos.x,
|
|
||||||
boids[b2b_cmps[8*i+1].from].pos.x,
|
|
||||||
boids[b2b_cmps[8*i+0].from].pos.x
|
|
||||||
);
|
|
||||||
|
|
||||||
__m256 from_pos_y = _mm256_set_ps(
|
__simd to_pos_x = GET_F32_CHUNK_FROM_BOIDS(i, to, pos.x);
|
||||||
boids[b2b_cmps[8*i+7].from].pos.y,
|
__simd to_pos_y = GET_F32_CHUNK_FROM_BOIDS(i, to, pos.y);
|
||||||
boids[b2b_cmps[8*i+6].from].pos.y,
|
|
||||||
boids[b2b_cmps[8*i+5].from].pos.y,
|
|
||||||
boids[b2b_cmps[8*i+4].from].pos.y,
|
|
||||||
boids[b2b_cmps[8*i+3].from].pos.y,
|
|
||||||
boids[b2b_cmps[8*i+2].from].pos.y,
|
|
||||||
boids[b2b_cmps[8*i+1].from].pos.y,
|
|
||||||
boids[b2b_cmps[8*i+0].from].pos.y
|
|
||||||
);
|
|
||||||
|
|
||||||
__m256 to_pos_x = _mm256_set_ps(
|
__simd sub_x = mm_sub_ps(from_pos_x, to_pos_x);
|
||||||
boids[b2b_cmps[8*i+7].to].pos.x,
|
__simd sub_y = mm_sub_ps(from_pos_y, to_pos_y);
|
||||||
boids[b2b_cmps[8*i+6].to].pos.x,
|
|
||||||
boids[b2b_cmps[8*i+5].to].pos.x,
|
|
||||||
boids[b2b_cmps[8*i+4].to].pos.x,
|
|
||||||
boids[b2b_cmps[8*i+3].to].pos.x,
|
|
||||||
boids[b2b_cmps[8*i+2].to].pos.x,
|
|
||||||
boids[b2b_cmps[8*i+1].to].pos.x,
|
|
||||||
boids[b2b_cmps[8*i+0].to].pos.x
|
|
||||||
);
|
|
||||||
|
|
||||||
__m256 to_pos_y = _mm256_set_ps(
|
__simd length_sqr = mm_fmadd_ps(sub_y, sub_y, mm_mul_ps(sub_x, sub_x));
|
||||||
boids[b2b_cmps[8*i+7].to].pos.y,
|
__simdi in_range_mask = (__simdi)mm_cmp_ps(length_sqr, view_radius, _CMP_LE_OQ);
|
||||||
boids[b2b_cmps[8*i+6].to].pos.y,
|
if (mm_is_zero(in_range_mask)) continue;
|
||||||
boids[b2b_cmps[8*i+5].to].pos.y,
|
|
||||||
boids[b2b_cmps[8*i+4].to].pos.y,
|
|
||||||
boids[b2b_cmps[8*i+3].to].pos.y,
|
|
||||||
boids[b2b_cmps[8*i+2].to].pos.y,
|
|
||||||
boids[b2b_cmps[8*i+1].to].pos.y,
|
|
||||||
boids[b2b_cmps[8*i+0].to].pos.y
|
|
||||||
);
|
|
||||||
|
|
||||||
__m256 sub_x = _mm256_sub_ps(from_pos_x, to_pos_x);
|
__simd from_dir_x = GET_F32_CHUNK_FROM_BOIDS(i, from, dir.x);
|
||||||
__m256 sub_y = _mm256_sub_ps(from_pos_y, to_pos_y);
|
__simd from_dir_y = GET_F32_CHUNK_FROM_BOIDS(i, from, dir.y);
|
||||||
|
|
||||||
__m256 length_sqr = _mm256_fmadd_ps(sub_y, sub_y, _mm256_mul_ps(sub_x, sub_x));
|
__simd to_dir_x = GET_F32_CHUNK_FROM_BOIDS(i, to, dir.x);
|
||||||
__m256i in_range_mask = (__m256i)_mm256_cmp_ps(length_sqr, view_radius, _CMP_LE_OQ);
|
__simd to_dir_y = GET_F32_CHUNK_FROM_BOIDS(i, to, dir.y);
|
||||||
if (mm256_is_zero(in_range_mask)) continue;
|
|
||||||
|
|
||||||
__m256 from_dir_x = _mm256_set_ps(
|
__simd is_length_zero = (__simd)mm_cmpeq_epi32((__simdi)length_sqr, (__simdi)zero);
|
||||||
boids[b2b_cmps[8*i+7].from].dir.x,
|
__simd ilength = mm_blendv_ps(mm_rsqrt_ps(length_sqr), zero, is_length_zero);
|
||||||
boids[b2b_cmps[8*i+6].from].dir.x,
|
|
||||||
boids[b2b_cmps[8*i+5].from].dir.x,
|
|
||||||
boids[b2b_cmps[8*i+4].from].dir.x,
|
|
||||||
boids[b2b_cmps[8*i+3].from].dir.x,
|
|
||||||
boids[b2b_cmps[8*i+2].from].dir.x,
|
|
||||||
boids[b2b_cmps[8*i+1].from].dir.x,
|
|
||||||
boids[b2b_cmps[8*i+0].from].dir.x
|
|
||||||
);
|
|
||||||
|
|
||||||
__m256 from_dir_y = _mm256_set_ps(
|
__simd x_norm = mm_mul_ps(sub_x, ilength);
|
||||||
boids[b2b_cmps[8*i+7].from].dir.y,
|
__simd y_norm = mm_mul_ps(sub_y, ilength);
|
||||||
boids[b2b_cmps[8*i+6].from].dir.y,
|
|
||||||
boids[b2b_cmps[8*i+5].from].dir.y,
|
|
||||||
boids[b2b_cmps[8*i+4].from].dir.y,
|
|
||||||
boids[b2b_cmps[8*i+3].from].dir.y,
|
|
||||||
boids[b2b_cmps[8*i+2].from].dir.y,
|
|
||||||
boids[b2b_cmps[8*i+1].from].dir.y,
|
|
||||||
boids[b2b_cmps[8*i+0].from].dir.y
|
|
||||||
);
|
|
||||||
|
|
||||||
__m256 to_dir_x = _mm256_set_ps(
|
__simd x_neg_norm = mm_mul_ps(x_norm, negative_one);
|
||||||
boids[b2b_cmps[8*i+7].to].dir.x,
|
__simd y_neg_norm = mm_mul_ps(y_norm, negative_one);
|
||||||
boids[b2b_cmps[8*i+6].to].dir.x,
|
|
||||||
boids[b2b_cmps[8*i+5].to].dir.x,
|
|
||||||
boids[b2b_cmps[8*i+4].to].dir.x,
|
|
||||||
boids[b2b_cmps[8*i+3].to].dir.x,
|
|
||||||
boids[b2b_cmps[8*i+2].to].dir.x,
|
|
||||||
boids[b2b_cmps[8*i+1].to].dir.x,
|
|
||||||
boids[b2b_cmps[8*i+0].to].dir.x
|
|
||||||
);
|
|
||||||
|
|
||||||
__m256 to_dir_y = _mm256_set_ps(
|
__simd dot_product1 = mm_fmadd_ps(from_dir_y, y_neg_norm, mm_mul_ps(from_dir_x, x_neg_norm));
|
||||||
boids[b2b_cmps[8*i+7].to].dir.y,
|
__simd in_angle_mask1 = mm_cmp_ps(dot_product1, dot_threshold, _CMP_GE_OQ);
|
||||||
boids[b2b_cmps[8*i+6].to].dir.y,
|
__simd do_append_mask1 = mm_and_ps(in_angle_mask1, (__simd)in_range_mask);
|
||||||
boids[b2b_cmps[8*i+5].to].dir.y,
|
|
||||||
boids[b2b_cmps[8*i+4].to].dir.y,
|
|
||||||
boids[b2b_cmps[8*i+3].to].dir.y,
|
|
||||||
boids[b2b_cmps[8*i+2].to].dir.y,
|
|
||||||
boids[b2b_cmps[8*i+1].to].dir.y,
|
|
||||||
boids[b2b_cmps[8*i+0].to].dir.y
|
|
||||||
);
|
|
||||||
|
|
||||||
__m256 is_length_zero = (__m256)_mm256_cmpeq_epi32((__m256i)length_sqr, (__m256i)zero);
|
__simd dot_product2 = mm_fmadd_ps(to_dir_y, y_norm, mm_mul_ps(to_dir_x, x_norm));
|
||||||
__m256 ilength = _mm256_blendv_ps(_mm256_rsqrt_ps(length_sqr), zero, is_length_zero);
|
__simd in_angle_mask2 = mm_cmp_ps(dot_product2, dot_threshold, _CMP_GE_OQ);
|
||||||
|
__simd do_append_mask2 = mm_and_ps(in_angle_mask2, (__simd)in_range_mask);
|
||||||
|
|
||||||
__m256 x_norm = _mm256_mul_ps(sub_x, ilength);
|
mm_store_ps((float*)do_append_mask1_f32, do_append_mask1);
|
||||||
__m256 y_norm = _mm256_mul_ps(sub_y, ilength);
|
mm_store_ps((float*)do_append_mask2_f32, do_append_mask2);
|
||||||
|
for (int j = 0; j < SIMD_32B_LANES; j++) {
|
||||||
__m256 x_neg_norm = _mm256_mul_ps(x_norm, negative_one);
|
uboid_t cmp_idx = SIMD_32B_LANES*i + j;
|
||||||
__m256 y_neg_norm = _mm256_mul_ps(y_norm, negative_one);
|
|
||||||
|
|
||||||
__m256 dot_product1 = _mm256_fmadd_ps(from_dir_y, y_neg_norm, _mm256_mul_ps(from_dir_x, x_neg_norm));
|
|
||||||
__m256 in_angle_mask1 = _mm256_cmp_ps(dot_product1, dot_threshold, _CMP_GE_OQ);
|
|
||||||
__m256 do_append_mask1 = _mm256_and_ps(in_angle_mask1, (__m256)in_range_mask);
|
|
||||||
|
|
||||||
__m256 dot_product2 = _mm256_fmadd_ps(to_dir_y, y_norm, _mm256_mul_ps(to_dir_x, x_norm));
|
|
||||||
__m256 in_angle_mask2 = _mm256_cmp_ps(dot_product2, dot_threshold, _CMP_GE_OQ);
|
|
||||||
__m256 do_append_mask2 = _mm256_and_ps(in_angle_mask2, (__m256)in_range_mask);
|
|
||||||
|
|
||||||
_mm256_store_ps((float*)do_append_mask1_f32, do_append_mask1);
|
|
||||||
_mm256_store_ps((float*)do_append_mask2_f32, do_append_mask2);
|
|
||||||
for (int j = 0; j < 8; j++) {
|
|
||||||
uboid_t cmp_idx = 8*i + j;
|
|
||||||
if (cmp_idx >= *b2b_cmps_count) break;
|
if (cmp_idx >= *b2b_cmps_count) break;
|
||||||
|
|
||||||
uboid_t from_boid = b2b_cmps[cmp_idx].from;
|
uboid_t from_boid = b2b_cmps[cmp_idx].from;
|
||||||
@ -383,9 +277,10 @@ static void world_process_local_boid_pairs(World *world, BoidList *local_boids,
|
|||||||
}
|
}
|
||||||
|
|
||||||
*b2b_cmps_count = 0;
|
*b2b_cmps_count = 0;
|
||||||
|
RPROF_STOP();
|
||||||
}
|
}
|
||||||
|
|
||||||
static void world_compute_local_boids_simd(BoidList *local_boids, World *world, ChunkGrid *chunks) {
|
static void world_compute_local_boids(BoidList *local_boids, World *world, ChunkGrid *chunks) {
|
||||||
Boid *boids = world->boids.data();
|
Boid *boids = world->boids.data();
|
||||||
int boid_count = world->boids.size();
|
int boid_count = world->boids.size();
|
||||||
MemoryArena *arena = &world->frame_arena;
|
MemoryArena *arena = &world->frame_arena;
|
||||||
@ -410,7 +305,7 @@ static void world_compute_local_boids_simd(BoidList *local_boids, World *world,
|
|||||||
}
|
}
|
||||||
RPROF_STOP();
|
RPROF_STOP();
|
||||||
|
|
||||||
RPROF_START("Calc dot products and ranges (simd)");
|
RPROF_START("Iterate over chunks");
|
||||||
for (int y = 0; y < chunks->height; y++) {
|
for (int y = 0; y < chunks->height; y++) {
|
||||||
for (int x = 0; x < chunks->width; x++) {
|
for (int x = 0; x < chunks->width; x++) {
|
||||||
size_t chunk_idx = chunkgrid_get_idx(chunks, x, y);
|
size_t chunk_idx = chunkgrid_get_idx(chunks, x, y);
|
||||||
@ -458,55 +353,13 @@ static void world_compute_local_boids_simd(BoidList *local_boids, World *world,
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (b2b_cmps_count > 2048*3) {
|
if (b2b_cmps_count > 2048*3) {
|
||||||
world_process_local_boid_pairs(world, local_boids, b2b_cmps, &b2b_cmps_count);
|
world_calc_distances_and_angles(world, local_boids, b2b_cmps, &b2b_cmps_count);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (b2b_cmps_count > 0) {
|
if (b2b_cmps_count > 0) {
|
||||||
world_process_local_boid_pairs(world, local_boids, b2b_cmps, &b2b_cmps_count);
|
world_calc_distances_and_angles(world, local_boids, b2b_cmps, &b2b_cmps_count);
|
||||||
}
|
|
||||||
|
|
||||||
RPROF_STOP();
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static void world_compute_local_boids_scalar(BoidList *local_boids, World *world, ChunkGrid *chunks) {
|
|
||||||
RPROF_START("Calc dot products and ranges (scalar)");
|
|
||||||
for (int y = 0; y < chunks->height; y++) {
|
|
||||||
for (int x = 0; x < chunks->width; x++) {
|
|
||||||
BoidList *chunk = chunkgrid_get(chunks, x, y);
|
|
||||||
if (chunk->count == 0) continue;
|
|
||||||
|
|
||||||
uboid_t chunk_boids[chunk->count];
|
|
||||||
boid_list_to_array(chunk_boids, chunk);
|
|
||||||
for (int i = 0; i < chunk->count-1; i++) {
|
|
||||||
uboid_t from_boid = chunk_boids[i];
|
|
||||||
uboid_t *to_boids = &chunk_boids[i+1];
|
|
||||||
uboid_t to_boids_count = chunk->count-i-1;
|
|
||||||
assign_local_boids_b2l(world, local_boids, from_boid, to_boids, to_boids_count);
|
|
||||||
}
|
|
||||||
|
|
||||||
Vector2 neighbours[] = { { 1, 0 }, { 0, 1 }, { 1, 1 }, { -1, 1 } };
|
|
||||||
for (int i = 0; i < ARRAY_LEN(neighbours); i++) {
|
|
||||||
int chunk_y = y + neighbours[i].y;
|
|
||||||
int chunk_x = x + neighbours[i].x;
|
|
||||||
if (chunk_y < 0 || chunk_y >= chunks->height) continue;
|
|
||||||
if (chunk_x < 0 || chunk_x >= chunks->width) continue;
|
|
||||||
|
|
||||||
BoidList *neighbour_chunk = chunkgrid_get(chunks, chunk_x, chunk_y);
|
|
||||||
if (neighbour_chunk->count == 0) continue;
|
|
||||||
|
|
||||||
uboid_t neighbour_ids[neighbour_chunk->count];
|
|
||||||
boid_list_to_array(neighbour_ids, neighbour_chunk);
|
|
||||||
|
|
||||||
uboid_t boid1;
|
|
||||||
BoidsListNodeIterator it1 = boid_list_get_iterator(chunk);
|
|
||||||
while (boid_list_iterator_next(&it1, &boid1)) {
|
|
||||||
assign_local_boids_b2l(world, local_boids, boid1, neighbour_ids, neighbour_chunk->count);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
RPROF_STOP();
|
RPROF_STOP();
|
||||||
}
|
}
|
||||||
@ -543,13 +396,7 @@ static BoidList* world_compute_local_boids(World *world) {
|
|||||||
RPROF_STOP();
|
RPROF_STOP();
|
||||||
|
|
||||||
RPROF_START("world_compute_local_boids()");
|
RPROF_START("world_compute_local_boids()");
|
||||||
#ifdef __EMSCRIPTEN__
|
world_compute_local_boids(all_local_boids, world, &chunks);
|
||||||
// TODO: Rewrite simd version to only use SSE, not AVX2
|
|
||||||
world_compute_local_boids_scalar(all_local_boids, world, &chunks);
|
|
||||||
#else
|
|
||||||
world_compute_local_boids_simd(all_local_boids, world, &chunks);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
RPROF_STOP();
|
RPROF_STOP();
|
||||||
|
|
||||||
return all_local_boids;
|
return all_local_boids;
|
||||||
|
Loading…
Reference in New Issue
Block a user