do some benchmarking

This commit is contained in:
Rokas Puzonas 2023-12-01 02:20:33 +02:00
parent e153201ff2
commit cb5ab7349d
2 changed files with 175 additions and 25 deletions

123
README.md
View File

@ -1,3 +1,126 @@
# Ray Tracing in One Weekend (CUDA edition) # Ray Tracing in One Weekend (CUDA edition)
Tutorial: https://github.com/rogerallen/raytracinginoneweekendincuda Tutorial: https://github.com/rogerallen/raytracinginoneweekendincuda
GPU Used: NVIDIA GeForce GTX 1660 Super
CUDA Cores: 1408
## Parameter sets
1:
* Resolution: 300x300
* Thread block size: 8x8
* Samples per pixel: 8
* Number of additional objects: 0
* Maximum number of bounces: 50
* Time to render: 0.0085s
---
2:
* Resolution: 600x600
* Thread block size: 4x4
* Samples per pixel: 50
* Number of additional objects: 200
* Maximum number of bounces: 50
* Time to render: 2.12857s
3:
* Resolution: 600x600
* Thread block size: 16x16
* Samples per pixel: 50
* Number of additional objects: 200
* Maximum number of bounces: 50
* Time to render: 2.4912s
4:
* Resolution: 600x600
* Thread block size: 24x24
* Samples per pixel: 50
* Number of additional objects: 200
* Maximum number of bounces: 50
* Time to render: 1.81483s
---
5:
* Resolution: 1000x1000
* Thread block size: 24x24
* Samples per pixel: 100
* Number of additional objects: 300
* Maximum number of bounces: 50
* Time to render: 14.7335s
6:
* Resolution: 1000x1000
* Thread block size: 26x26
* Samples per pixel: 100
* Number of additional objects: 300
* Maximum number of bounces: 50
* Time to render: 17.4273s
7:
* Resolution: 1000x1000
* Thread block size: 4x4
* Samples per pixel: 100
* Number of additional objects: 300
* Maximum number of bounces: 50
* Time to render: 19.0303s
8:
* Resolution: 1000x1000
* Thread block size: 16x16
* Samples per pixel: 100
* Number of additional objects: 300
* Maximum number of bounces: 50
* Time to render: 23.2295s
9:
* Resolution: 1000x1000
* Thread block size: 20x20
* Samples per pixel: 100
* Number of additional objects: 300
* Maximum number of bounces: 50
* Time to render: 23.6136s
10:
* Resolution: 1000x1000
* Thread block size: 12x12
* Samples per pixel: 100
* Number of additional objects: 300
* Maximum number of bounces: 50
* Time to render: 27.5289s
11:
* Resolution: 1000x1000
* Thread block size: 29x29
* Samples per pixel: 100
* Number of additional objects: 300
* Maximum number of bounces: 50
* Time to render: 28.6761s
12:
* Resolution: 1000x1000
* Thread block size: 8x8
* Samples per pixel: 100
* Number of additional objects: 300
* Maximum number of bounces: 50
* Time to render: 33.2001s
13:
* Resolution: 1000x1000
* Thread block size: 2x2
* Samples per pixel: 100
* Number of additional objects: 300
* Maximum number of bounces: 50
* Time to render: 55.2398s
---
14:
* Resolution: 2400x1200
* Thread block size: 16x16
* Samples per pixel: 100
* Number of additional objects: 500
* Maximum number of bounces: 50
* Time to render: 124.928s

View File

@ -12,6 +12,8 @@
#include "camera.cpp" #include "camera.cpp"
#include "hitable_list.cpp" #include "hitable_list.cpp"
#define MAX_BOUNCES 50
#define checkCudaErrors(val) check_cuda((val), #val, __FILE__, __LINE__) #define checkCudaErrors(val) check_cuda((val), #val, __FILE__, __LINE__)
void check_cuda(cudaError_t result, const char *func, const char *file, int line) { void check_cuda(cudaError_t result, const char *func, const char *file, int line) {
@ -35,7 +37,7 @@ __device__ bool hit_sphere(const vec3& center, float radius, const ray& r) {
__device__ vec3 color(const ray& r, hitable **world, curandState *local_rand_state) { __device__ vec3 color(const ray& r, hitable **world, curandState *local_rand_state) {
ray cur_ray = r; ray cur_ray = r;
vec3 cur_attenuation = vec3(1,1,1); vec3 cur_attenuation = vec3(1,1,1);
for (int i = 0; i < 50; i++) { for (int i = 0; i < MAX_BOUNCES; i++) {
hit_record rec; hit_record rec;
if ((*world)->hit(cur_ray, 0.001f, FLT_MAX, rec)) { if ((*world)->hit(cur_ray, 0.001f, FLT_MAX, rec)) {
ray scattered; ray scattered;
@ -90,7 +92,7 @@ __global__ void render(vec3 *fb,
ray r = (*cam)->get_ray(u,v, &local_rand_state); ray r = (*cam)->get_ray(u,v, &local_rand_state);
col += color(r, world, &local_rand_state); col += color(r, world, &local_rand_state);
} }
rand_state[pixel_idx] = local_rand_state; // rand_state[pixel_idx] = local_rand_state;
col /= float(ns); col /= float(ns);
col[0] = sqrt(col[0]); col[0] = sqrt(col[0]);
@ -99,7 +101,7 @@ __global__ void render(vec3 *fb,
fb[pixel_idx] = col; fb[pixel_idx] = col;
} }
__global__ void create_world(hitable **d_list, int d_list_size, hitable **d_world, camera **d_camera, int nx, int ny, curandState *rand_state) { __global__ void create_world(hitable **d_list, int d_list_size, hitable **d_world, camera **d_camera, int nx, int ny, curandState *rand_state, int ball_count) {
if (!(threadIdx.x == 0 && blockIdx.y == 0)) return; if (!(threadIdx.x == 0 && blockIdx.y == 0)) return;
curandState local_rand_state = *rand_state; curandState local_rand_state = *rand_state;
@ -108,10 +110,24 @@ __global__ void create_world(hitable **d_list, int d_list_size, hitable **d_worl
#define RND() curand_uniform(&local_rand_state) #define RND() curand_uniform(&local_rand_state)
int idx = 1; int idx = 1;
for (int a = -11; a < 11; a++) { // for (int a = -11; a < 11; a++) {
for (int b = -11; b < 11; b++) { // for (int b = -11; b < 11; b++) {
// float choose_mat = RND();
// vec3 center(a + RND(), 0.2, b + RND());
// material *mat;
// if (choose_mat < 0.8f) {
// mat = new lambertian(vec3(RND()*RND(), RND()*RND(), RND()*RND()));
// } else if (choose_mat < 0.95f) {
// mat = new metal(vec3(0.5f*(1.0f+RND()), 0.5f*(1.0f+RND()), 0.5f*(1.0f+RND())), 0.5f*RND());
// } else {
// mat = new dielectric(1.5);
// }
// d_list[idx++] = new sphere(center, 0.2, mat);
// }
// }
for (int i = 0; i < ball_count; i++) {
float choose_mat = RND(); float choose_mat = RND();
vec3 center(a + RND(), 0.2, b + RND()); vec3 center(RND()*22.0f-11.0f, 0.2f, RND()*22.0f-11.0f);
material *mat; material *mat;
if (choose_mat < 0.8f) { if (choose_mat < 0.8f) {
mat = new lambertian(vec3(RND()*RND(), RND()*RND(), RND()*RND())); mat = new lambertian(vec3(RND()*RND(), RND()*RND(), RND()*RND()));
@ -121,7 +137,6 @@ __global__ void create_world(hitable **d_list, int d_list_size, hitable **d_worl
mat = new dielectric(1.5); mat = new dielectric(1.5);
} }
d_list[idx++] = new sphere(center, 0.2, mat); d_list[idx++] = new sphere(center, 0.2, mat);
}
} }
#undef RND #undef RND
@ -160,11 +175,14 @@ __global__ void free_world(hitable **d_list, int d_list_size, hitable **d_world,
int main() { int main() {
const char *image_filename = "out.ppm"; const char *image_filename = "out.ppm";
int nx = 1200; int nx = 1000;
int ny = 600; int ny = 1000;
int ns = 20; int ns = 100;
int tx = 16; int tx = 24;
int ty = 16; int ty = 24;
int number_of_balls = 300;
int runs = 5;
std::cout << "Rendering a " << nx << "x" << ny << " image "; std::cout << "Rendering a " << nx << "x" << ny << " image ";
std::cout << "in " << tx << "x" << ty << " blocks.\n"; std::cout << "in " << tx << "x" << ty << " blocks.\n";
@ -183,37 +201,46 @@ int main() {
checkCudaErrors(cudaMalloc((void **)&d_rand_world_state, num_pixels*sizeof(curandState))); checkCudaErrors(cudaMalloc((void **)&d_rand_world_state, num_pixels*sizeof(curandState)));
// Initialize world rand state // Initialize world rand state
std::cout << "Initializing world random\n";
rand_init<<<1,1>>>(d_rand_world_state); rand_init<<<1,1>>>(d_rand_world_state);
checkCudaErrors(cudaGetLastError()); checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
// populate world // populate world
std::cout << "Populating world\n";
hitable **d_list; hitable **d_list;
int d_list_size = 22*22 + 1 + 3; int d_list_size = number_of_balls + 1 + 3;
checkCudaErrors(cudaMalloc((void **)&d_list, d_list_size*sizeof(hitable *))); checkCudaErrors(cudaMalloc((void **)&d_list, d_list_size*sizeof(hitable *)));
hitable **d_world; hitable **d_world;
checkCudaErrors(cudaMalloc((void **)&d_world, sizeof(hitable *))); checkCudaErrors(cudaMalloc((void **)&d_world, sizeof(hitable *)));
camera **d_camera; camera **d_camera;
checkCudaErrors(cudaMalloc((void **)&d_camera, sizeof(camera *))); checkCudaErrors(cudaMalloc((void **)&d_camera, sizeof(camera *)));
create_world<<<1,1>>>(d_list, d_list_size, d_world, d_camera, nx, ny, d_rand_world_state); create_world<<<1,1>>>(d_list, d_list_size, d_world, d_camera, nx, ny, d_rand_world_state, number_of_balls);
checkCudaErrors(cudaGetLastError()); checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
// Render frame buffer // Render frame buffer
clock_t start = clock(); std::cout << "Started rendering\n";
{ double average_time = 0;
dim3 blocks(nx/tx+1, ny/ty+1); for (int i = 0; i < runs; i++) {
dim3 threads(tx, ty); clock_t start = clock();
{
dim3 blocks(nx/tx+1, ny/ty+1);
dim3 threads(tx, ty);
render_init<<<blocks, threads>>>(nx, ny, d_rand_state); render_init<<<blocks, threads>>>(nx, ny, d_rand_state);
render<<<blocks, threads>>>(fb, nx, ny, ns, d_camera, d_world, d_rand_state); render<<<blocks, threads>>>(fb, nx, ny, ns, d_camera, d_world, d_rand_state);
checkCudaErrors(cudaGetLastError()); checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
}
clock_t stop = clock();
double timer_seconds = ((double)(stop - start)) / CLOCKS_PER_SEC;
std::cout << "took " << timer_seconds << " seconds.\n";
average_time += timer_seconds;
} }
clock_t stop = clock(); average_time /= runs;
double timer_seconds = ((double)(stop - start)) / CLOCKS_PER_SEC; std::cout << "average time: " << average_time << " seconds.\n";
std::cout << "took " << timer_seconds << " seconds.\n";
// Saveing frame buffer // Saveing frame buffer
FILE *f = fopen(image_filename, "w"); FILE *f = fopen(image_filename, "w");