From cb5ab7349d0f484fa77c63c8d5418db990a1a632 Mon Sep 17 00:00:00 2001
From: Rokas Puzonas <rokas.puz@gmail.com>
Date: Fri, 1 Dec 2023 02:20:33 +0200
Subject: [PATCH] do some benchmarking

---
 README.md   | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/main.cu |  77 +++++++++++++++++++++-----------
 2 files changed, 175 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index fcb1633..6588c4a 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,126 @@
 # Ray Tracing in One Weekend (CUDA edition)
 
 Tutorial: https://github.com/rogerallen/raytracinginoneweekendincuda
+
+* GPU Used: NVIDIA GeForce GTX 1660 Super
+* CUDA Cores: 1408
+
+## Parameter sets
+
+1:
+* Resolution: 300x300
+* Thread block size: 8x8
+* Samples per pixel: 8
+* Number of additional objects: 0
+* Maximum number of bounces: 50
+* Time to render: 0.0085s
+
+---
+
+2:
+* Resolution: 600x600
+* Thread block size: 4x4
+* Samples per pixel: 50
+* Number of additional objects: 200
+* Maximum number of bounces: 50
+* Time to render: 2.12857s
+
+3:
+* Resolution: 600x600
+* Thread block size: 16x16
+* Samples per pixel: 50
+* Number of additional objects: 200
+* Maximum number of bounces: 50
+* Time to render: 2.4912s
+
+4:
+* Resolution: 600x600
+* Thread block size: 24x24
+* Samples per pixel: 50
+* Number of additional objects: 200
+* Maximum number of bounces: 50
+* Time to render: 1.81483s
+
+---
+
+5:
+* Resolution: 1000x1000
+* Thread block size: 24x24
+* Samples per pixel: 100
+* Number of additional objects: 300
+* Maximum number of bounces: 50
+* Time to render: 14.7335s
+
+6:
+* Resolution: 1000x1000
+* Thread block size: 26x26
+* Samples per pixel: 100
+* Number of additional objects: 300
+* Maximum number of bounces: 50
+* Time to render: 17.4273s
+
+7:
+* Resolution: 1000x1000
+* Thread block size: 4x4
+* Samples per pixel: 100
+* Number of additional objects: 300
+* Maximum number of bounces: 50
+* Time to render: 19.0303s
+
+8:
+* Resolution: 1000x1000
+* Thread block size: 16x16
+* Samples per pixel: 100
+* Number of additional objects: 300
+* Maximum number of bounces: 50
+* Time to render: 23.2295s
+
+9:
+* Resolution: 1000x1000
+* Thread block size: 20x20
+* Samples per pixel: 100
+* Number of additional objects: 300
+* Maximum number of bounces: 50
+* Time to render: 23.6136s
+
+10:
+* Resolution: 1000x1000
+* Thread block size: 12x12
+* Samples per pixel: 100
+* Number of additional objects: 300
+* Maximum number of bounces: 50
+* Time to render: 27.5289s
+
+11:
+* Resolution: 1000x1000
+* Thread block size: 29x29
+* Samples per pixel: 100
+* Number of additional objects: 300
+* Maximum number of bounces: 50
+* Time to render: 28.6761s
+
+12:
+* Resolution: 1000x1000
+* Thread block size: 8x8
+* Samples per pixel: 100
+* Number of additional objects: 300
+* Maximum number of bounces: 50
+* Time to render: 33.2001s
+
+13:
+* Resolution: 1000x1000
+* Thread block size: 2x2
+* Samples per pixel: 100
+* Number of additional objects: 300
+* Maximum number of bounces: 50
+* Time to render: 55.2398s
+
+---
+
+14:
+* Resolution: 2400x1200
+* Thread block size: 16x16
+* Samples per pixel: 100
+* Number of additional objects: 500
+* Maximum number of bounces: 50
+* Time to render: 124.928s
diff --git a/src/main.cu b/src/main.cu
index 15da3a1..d03ef21 100644
--- a/src/main.cu
+++ b/src/main.cu
@@ -12,6 +12,8 @@
 #include "camera.cpp"
 #include "hitable_list.cpp"
 
+#define MAX_BOUNCES 50 // upper bound on iterations of the per-ray bounce loop in color()
+
 #define checkCudaErrors(val) check_cuda((val), #val, __FILE__, __LINE__)
 
 void check_cuda(cudaError_t result, const char *func, const char *file, int line) {
@@ -35,7 +37,7 @@ __device__ bool hit_sphere(const vec3& center, float radius, const ray& r) {
 __device__ vec3 color(const ray& r, hitable **world, curandState *local_rand_state) {
     ray cur_ray = r;
     vec3 cur_attenuation = vec3(1,1,1);
-    for (int i = 0; i < 50; i++) {
+    for (int i = 0; i < MAX_BOUNCES; i++) {
         hit_record rec;
         if ((*world)->hit(cur_ray, 0.001f, FLT_MAX, rec)) {
             ray scattered;
@@ -90,7 +92,7 @@ __global__ void render(vec3 *fb,
         ray r = (*cam)->get_ray(u,v, &local_rand_state);
         col += color(r, world, &local_rand_state);
     }
-    rand_state[pixel_idx] = local_rand_state;
+    // rand_state[pixel_idx] = local_rand_state;
 
     col /= float(ns);
     col[0] = sqrt(col[0]);
@@ -99,7 +101,7 @@ __global__ void render(vec3 *fb,
     fb[pixel_idx] = col;
 }
 
-__global__ void create_world(hitable **d_list, int d_list_size, hitable **d_world, camera **d_camera, int nx, int ny, curandState *rand_state) {
+__global__ void create_world(hitable **d_list, int d_list_size, hitable **d_world, camera **d_camera, int nx, int ny, curandState *rand_state, int ball_count) {
     if (!(threadIdx.x == 0 && blockIdx.y == 0)) return;
 
     curandState local_rand_state = *rand_state;
@@ -108,10 +110,24 @@ __global__ void create_world(hitable **d_list, int d_list_size, hitable **d_worl
     #define RND() curand_uniform(&local_rand_state)
 
     int idx = 1;
-    for (int a = -11; a < 11; a++) {
-        for (int b = -11; b < 11; b++) {
+    // for (int a = -11; a < 11; a++) {
+    //     for (int b = -11; b < 11; b++) {
+    //         float choose_mat = RND();
+    //         vec3 center(a + RND(), 0.2, b + RND());
+    //         material *mat;
+    //         if (choose_mat < 0.8f) {
+    //             mat = new lambertian(vec3(RND()*RND(), RND()*RND(), RND()*RND()));
+    //         } else if (choose_mat < 0.95f) {
+    //             mat = new metal(vec3(0.5f*(1.0f+RND()), 0.5f*(1.0f+RND()), 0.5f*(1.0f+RND())), 0.5f*RND());
+    //         } else {
+    //             mat = new dielectric(1.5);
+    //         }
+    //         d_list[idx++] = new sphere(center, 0.2, mat);
+    //     }
+    // }
+    for (int i = 0; i < ball_count; i++) {
             float choose_mat = RND();
-            vec3 center(a + RND(), 0.2, b + RND());
+            vec3 center(RND()*22.0f-11.0f, 0.2f, RND()*22.0f-11.0f);
             material *mat;
             if (choose_mat < 0.8f) {
                 mat = new lambertian(vec3(RND()*RND(), RND()*RND(), RND()*RND()));
@@ -121,7 +137,6 @@ __global__ void create_world(hitable **d_list, int d_list_size, hitable **d_worl
                 mat = new dielectric(1.5);
             }
             d_list[idx++] = new sphere(center, 0.2, mat);
-        }
     }
 
     #undef RND
@@ -160,11 +175,14 @@ __global__ void free_world(hitable **d_list, int d_list_size, hitable **d_world,
 
 int main() {
     const char *image_filename = "out.ppm";
-    int nx = 1200;
-    int ny = 600;
-    int ns = 20;
-    int tx = 16;
-    int ty = 16;
+    int nx = 1000;
+    int ny = 1000;
+    int ns = 100;
+    int tx = 24;
+    int ty = 24;
+    int number_of_balls = 300;
+
+    int runs = 5;
 
     std::cout << "Rendering a " << nx << "x" << ny << " image ";
     std::cout << "in " << tx << "x" << ty << " blocks.\n";
@@ -183,37 +201,46 @@ int main() {
     checkCudaErrors(cudaMalloc((void **)&d_rand_world_state, num_pixels*sizeof(curandState)));
 
     // Initialize world rand state
+    std::cout << "Initializing world random\n";
     rand_init<<<1,1>>>(d_rand_world_state);
     checkCudaErrors(cudaGetLastError());
     checkCudaErrors(cudaDeviceSynchronize());
 
     // populate world
+    std::cout << "Populating world\n";
     hitable **d_list;
-    int d_list_size = 22*22 + 1 + 3;
+    int d_list_size = number_of_balls + 1 + 3;
     checkCudaErrors(cudaMalloc((void **)&d_list, d_list_size*sizeof(hitable *)));
     hitable **d_world;
     checkCudaErrors(cudaMalloc((void **)&d_world, sizeof(hitable *)));
     camera **d_camera;
     checkCudaErrors(cudaMalloc((void **)&d_camera, sizeof(camera *)));
-    create_world<<<1,1>>>(d_list, d_list_size, d_world, d_camera, nx, ny, d_rand_world_state);
+    create_world<<<1,1>>>(d_list, d_list_size, d_world, d_camera, nx, ny, d_rand_world_state, number_of_balls);
     checkCudaErrors(cudaGetLastError());
     checkCudaErrors(cudaDeviceSynchronize());
 
     // Render frame buffer
-    clock_t start = clock();
-    {
-        dim3 blocks(nx/tx+1, ny/ty+1);
-        dim3 threads(tx, ty);
+    std::cout << "Started rendering\n";
+    double average_time = 0;
+    for (int i = 0; i < runs; i++) {
+        clock_t start = clock();
+        {
+            dim3 blocks(nx/tx+1, ny/ty+1);
+            dim3 threads(tx, ty);
 
-        render_init<<<blocks, threads>>>(nx, ny, d_rand_state);
+            render_init<<<blocks, threads>>>(nx, ny, d_rand_state);
 
-        render<<<blocks, threads>>>(fb, nx, ny, ns, d_camera, d_world, d_rand_state);
-        checkCudaErrors(cudaGetLastError());
-        checkCudaErrors(cudaDeviceSynchronize());
+            render<<<blocks, threads>>>(fb, nx, ny, ns, d_camera, d_world, d_rand_state);
+            checkCudaErrors(cudaGetLastError());
+            checkCudaErrors(cudaDeviceSynchronize());
+        }
+        clock_t stop = clock();
+        double timer_seconds = ((double)(stop - start)) / CLOCKS_PER_SEC;
+        std::cout << "took " << timer_seconds << " seconds.\n";
+        average_time += timer_seconds;
     }
-    clock_t stop = clock();
-    double timer_seconds = ((double)(stop - start)) / CLOCKS_PER_SEC;
-    std::cout << "took " << timer_seconds << " seconds.\n";
+    average_time /= runs;
+    std::cout << "average time: " << average_time << " seconds.\n";
 
     // Saveing frame buffer
     FILE *f = fopen(image_filename, "w");