do some benchmarking

2023-12-01 02:20:33 +02:00 · 2023-12-01 02:20:33 +02:00 · cb5ab7349d
commit cb5ab7349d
parent e153201ff2
2 changed files with 175 additions and 25 deletions
--- a/README.md
+++ b/README.md
@ -1,3 +1,126 @@
 # Ray Tracing in One Weekend (CUDA edition)
 Tutorial: https://github.com/rogerallen/raytracinginoneweekendincuda
 GPU Used: NVIDIA GeForce GTX 1660 Super
 CUDA Cores: 1408
 ## Parameter sets
 1:
 * Resolution: 300x300
 * Thread block size: 8x8
 * Samples per pixel: 8
 * Number of additional objects: 0
 * Maximum number of bounces: 50
 * Time to render: 0.0085s
 ---
 2:
 * Resolution: 600x600
 * Thread block size: 4x4
 * Samples per pixel: 50
 * Number of additional objects: 200
 * Maximum number of bounces: 50
 * Time to render: 2.12857s
 3:
 * Resolution: 600x600
 * Thread block size: 16x16
 * Samples per pixel: 50
 * Number of additional objects: 200
 * Maximum number of bounces: 50
 * Time to render: 2.4912s
 4:
 * Resolution: 600x600
 * Thread block size: 24x24
 * Samples per pixel: 50
 * Number of additional objects: 200
 * Maximum number of bounces: 50
 * Time to render: 1.81483s
 ---
 5:
 * Resolution: 1000x1000
 * Thread block size: 24x24
 * Samples per pixel: 100
 * Number of additional objects: 300
 * Maximum number of bounces: 50
 * Time to render: 14.7335s
 6:
 * Resolution: 1000x1000
 * Thread block size: 26x26
 * Samples per pixel: 100
 * Number of additional objects: 300
 * Maximum number of bounces: 50
 * Time to render: 17.4273s
 7:
 * Resolution: 1000x1000
 * Thread block size: 4x4
 * Samples per pixel: 100
 * Number of additional objects: 300
 * Maximum number of bounces: 50
 * Time to render: 19.0303s
 8:
 * Resolution: 1000x1000
 * Thread block size: 16x16
 * Samples per pixel: 100
 * Number of additional objects: 300
 * Maximum number of bounces: 50
 * Time to render: 23.2295s
 9:
 * Resolution: 1000x1000
 * Thread block size: 20x20
 * Samples per pixel: 100
 * Number of additional objects: 300
 * Maximum number of bounces: 50
 * Time to render: 23.6136s
 10:
 * Resolution: 1000x1000
 * Thread block size: 12x12
 * Samples per pixel: 100
 * Number of additional objects: 300
 * Maximum number of bounces: 50
 * Time to render: 27.5289s
 11:
 * Resolution: 1000x1000
 * Thread block size: 29x29
 * Samples per pixel: 100
 * Number of additional objects: 300
 * Maximum number of bounces: 50
 * Time to render: 28.6761s
 12:
 * Resolution: 1000x1000
 * Thread block size: 8x8
 * Samples per pixel: 100
 * Number of additional objects: 300
 * Maximum number of bounces: 50
 * Time to render: 33.2001s
 13:
 * Resolution: 1000x1000
 * Thread block size: 2x2
 * Samples per pixel: 100
 * Number of additional objects: 300
 * Maximum number of bounces: 50
 * Time to render: 55.2398s
 ---
 14:
 * Resolution: 2400x1200
 * Thread block size: 16x16
 * Samples per pixel: 100
 * Number of additional objects: 500
 * Maximum number of bounces: 50
 * Time to render: 124.928s
--- a/src/main.cu
+++ b/src/main.cu
@ -12,6 +12,8 @@
 #include "camera.cpp"
 #include "hitable_list.cpp"
 #define MAX_BOUNCES 50
 #define checkCudaErrors(val) check_cuda((val), #val, __FILE__, __LINE__)
 void check_cuda(cudaError_t result, const char *func, const char *file, int line) {
@ -35,7 +37,7 @@ __device__ bool hit_sphere(const vec3& center, float radius, const ray& r) {
 __device__ vec3 color(const ray& r, hitable **world, curandState *local_rand_state) {
    ray cur_ray = r;
    vec3 cur_attenuation = vec3(1,1,1);
-    for (int i = 0; i < 50; i++) {
+    for (int i = 0; i < MAX_BOUNCES; i++) {
        hit_record rec;
        if ((*world)->hit(cur_ray, 0.001f, FLT_MAX, rec)) {
            ray scattered;
@ -90,7 +92,7 @@ __global__ void render(vec3 *fb,
        ray r = (*cam)->get_ray(u,v, &local_rand_state);
        col += color(r, world, &local_rand_state);
    }
-    rand_state[pixel_idx] = local_rand_state;
+    // rand_state[pixel_idx] = local_rand_state;
    col /= float(ns);
    col[0] = sqrt(col[0]);
@ -99,7 +101,7 @@ __global__ void render(vec3 *fb,
    fb[pixel_idx] = col;
 }
-__global__ void create_world(hitable **d_list, int d_list_size, hitable **d_world, camera **d_camera, int nx, int ny, curandState *rand_state) {
+__global__ void create_world(hitable **d_list, int d_list_size, hitable **d_world, camera **d_camera, int nx, int ny, curandState *rand_state, int ball_count) {
    if (!(threadIdx.x == 0 && blockIdx.y == 0)) return;
    curandState local_rand_state = *rand_state;
@ -108,10 +110,24 @@ __global__ void create_world(hitable **d_list, int d_list_size, hitable **d_worl
    #define RND() curand_uniform(&local_rand_state)
    int idx = 1;
-    for (int a = -11; a < 11; a++) {
+    // for (int a = -11; a < 11; a++) {
-        for (int b = -11; b < 11; b++) {
+    //     for (int b = -11; b < 11; b++) {
    //         float choose_mat = RND();
    //         vec3 center(a + RND(), 0.2, b + RND());
    //         material *mat;
    //         if (choose_mat < 0.8f) {
    //             mat = new lambertian(vec3(RND()*RND(), RND()*RND(), RND()*RND()));
    //         } else if (choose_mat < 0.95f) {
    //             mat = new metal(vec3(0.5f*(1.0f+RND()), 0.5f*(1.0f+RND()), 0.5f*(1.0f+RND())), 0.5f*RND());
    //         } else {
    //             mat = new dielectric(1.5);
    //         }
    //         d_list[idx++] = new sphere(center, 0.2, mat);
    //     }
    // }
    for (int i = 0; i < ball_count; i++) {
            float choose_mat = RND();
-            vec3 center(a + RND(), 0.2, b + RND());
+            vec3 center(RND()*22.0f-11.0f, 0.2f, RND()*22.0f-11.0f);
            material *mat;
            if (choose_mat < 0.8f) {
                mat = new lambertian(vec3(RND()*RND(), RND()*RND(), RND()*RND()));
@ -121,7 +137,6 @@ __global__ void create_world(hitable **d_list, int d_list_size, hitable **d_worl
                mat = new dielectric(1.5);
            }
            d_list[idx++] = new sphere(center, 0.2, mat);
        }
    }
    #undef RND
@ -160,11 +175,14 @@ __global__ void free_world(hitable **d_list, int d_list_size, hitable **d_world,
 int main() {
    const char *image_filename = "out.ppm";
-    int nx = 1200;
+    int nx = 1000;
-    int ny = 600;
+    int ny = 1000;
-    int ns = 20;
+    int ns = 100;
-    int tx = 16;
+    int tx = 24;
-    int ty = 16;
+    int ty = 24;
    int number_of_balls = 300;
    int runs = 5;
    std::cout << "Rendering a " << nx << "x" << ny << " image ";
    std::cout << "in " << tx << "x" << ty << " blocks.\n";
@ -183,37 +201,46 @@ int main() {
    checkCudaErrors(cudaMalloc((void **)&d_rand_world_state, num_pixels*sizeof(curandState)));
    // Initialize world rand state
    std::cout << "Initializing world random\n";
    rand_init<<<1,1>>>(d_rand_world_state);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());
    // populate world
    std::cout << "Populating world\n";
    hitable **d_list;
-    int d_list_size = 22*22 + 1 + 3;
+    int d_list_size = number_of_balls + 1 + 3;
    checkCudaErrors(cudaMalloc((void **)&d_list, d_list_size*sizeof(hitable *)));
    hitable **d_world;
    checkCudaErrors(cudaMalloc((void **)&d_world, sizeof(hitable *)));
    camera **d_camera;
    checkCudaErrors(cudaMalloc((void **)&d_camera, sizeof(camera *)));
-    create_world<<<1,1>>>(d_list, d_list_size, d_world, d_camera, nx, ny, d_rand_world_state);
+    create_world<<<1,1>>>(d_list, d_list_size, d_world, d_camera, nx, ny, d_rand_world_state, number_of_balls);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());
    // Render frame buffer
-    clock_t start = clock();
+    std::cout << "Started rendering\n";
-    {
+    double average_time = 0;
-        dim3 blocks(nx/tx+1, ny/ty+1);
+    for (int i = 0; i < runs; i++) {
-        dim3 threads(tx, ty);
+        clock_t start = clock();
        {
            dim3 blocks(nx/tx+1, ny/ty+1);
            dim3 threads(tx, ty);
-        render_init<<<blocks, threads>>>(nx, ny, d_rand_state);
+            render_init<<<blocks, threads>>>(nx, ny, d_rand_state);
-        render<<<blocks, threads>>>(fb, nx, ny, ns, d_camera, d_world, d_rand_state);
+            render<<<blocks, threads>>>(fb, nx, ny, ns, d_camera, d_world, d_rand_state);
-        checkCudaErrors(cudaGetLastError());
+            checkCudaErrors(cudaGetLastError());
-        checkCudaErrors(cudaDeviceSynchronize());
+            checkCudaErrors(cudaDeviceSynchronize());
        }
        clock_t stop = clock();
        double timer_seconds = ((double)(stop - start)) / CLOCKS_PER_SEC;
        std::cout << "took " << timer_seconds << " seconds.\n";
        average_time += timer_seconds;
    }
-    clock_t stop = clock();
+    average_time /= runs;
-    double timer_seconds = ((double)(stop - start)) / CLOCKS_PER_SEC;
+    std::cout << "average time: " << average_time << " seconds.\n";
    std::cout << "took " << timer_seconds << " seconds.\n";
    // Saving frame buffer
    FILE *f = fopen(image_filename, "w");