From cb5ab7349d0f484fa77c63c8d5418db990a1a632 Mon Sep 17 00:00:00 2001 From: Rokas Puzonas Date: Fri, 1 Dec 2023 02:20:33 +0200 Subject: [PATCH] do some benchmarking --- README.md | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/main.cu | 77 +++++++++++++++++++++----------- 2 files changed, 175 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index fcb1633..6588c4a 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,126 @@ # Ray Tracing in One Weekend (CUDA edition) Tutorial: https://github.com/rogerallen/raytracinginoneweekendincuda + +GPU Used: NVIDIA GeForce GTX 1660 Super +CUDA Cores: 1408 + +## Parameter sets + +1: +* Resolution: 300x300 +* Thread block size: 8x8 +* Samples per pixel: 8 +* Number of additional objects: 0 +* Maximum number of bounces: 50 +* Time to render: 0.0085s + +--- + +2: +* Resolution: 600x600 +* Thread block size: 4x4 +* Samples per pixel: 50 +* Number of additional objects: 200 +* Maximum number of bounces: 50 +* Time to render: 2.12857s + +3: +* Resolution: 600x600 +* Thread block size: 16x16 +* Samples per pixel: 50 +* Number of additional objects: 200 +* Maximum number of bounces: 50 +* Time to render: 2.4912s + +4: +* Resolution: 600x600 +* Thread block size: 24x24 +* Samples per pixel: 50 +* Number of additional objects: 200 +* Maximum number of bounces: 50 +* Time to render: 1.81483s + +--- + +5: +* Resolution: 1000x1000 +* Thread block size: 24x24 +* Samples per pixel: 100 +* Number of additional objects: 300 +* Maximum number of bounces: 50 +* Time to render: 14.7335s + +6: +* Resolution: 1000x1000 +* Thread block size: 26x26 +* Samples per pixel: 100 +* Number of additional objects: 300 +* Maximum number of bounces: 50 +* Time to render: 17.4273s + +7: +* Resolution: 1000x1000 +* Thread block size: 4x4 +* Samples per pixel: 100 +* Number of additional objects: 300 +* Maximum number of bounces: 50 +* Time to render: 19.0303s + +8: +* Resolution: 1000x1000 +* Thread block size: 16x16 +* Samples per 
pixel: 100 +* Number of additional objects: 300 +* Maximum number of bounces: 50 +* Time to render: 23.2295s + +9: +* Resolution: 1000x1000 +* Thread block size: 20x20 +* Samples per pixel: 100 +* Number of additional objects: 300 +* Maximum number of bounces: 50 +* Time to render: 23.6136s + +10: +* Resolution: 1000x1000 +* Thread block size: 12x12 +* Samples per pixel: 100 +* Number of additional objects: 300 +* Maximum number of bounces: 50 +* Time to render: 27.5289s + +11: +* Resolution: 1000x1000 +* Thread block size: 29x29 +* Samples per pixel: 100 +* Number of additional objects: 300 +* Maximum number of bounces: 50 +* Time to render: 28.6761s + +12: +* Resolution: 1000x1000 +* Thread block size: 8x8 +* Samples per pixel: 100 +* Number of additional objects: 300 +* Maximum number of bounces: 50 +* Time to render: 33.2001s + +13: +* Resolution: 1000x1000 +* Thread block size: 2x2 +* Samples per pixel: 100 +* Number of additional objects: 300 +* Maximum number of bounces: 50 +* Time to render: 55.2398s + +--- + +14: +* Resolution: 2400x1200 +* Thread block size: 16x16 +* Samples per pixel: 100 +* Number of additional objects: 500 +* Maximum number of bounces: 50 +* Time to render: 124.928s diff --git a/src/main.cu b/src/main.cu index 15da3a1..d03ef21 100644 --- a/src/main.cu +++ b/src/main.cu @@ -12,6 +12,8 @@ #include "camera.cpp" #include "hitable_list.cpp" +#define MAX_BOUNCES 50 + #define checkCudaErrors(val) check_cuda((val), #val, __FILE__, __LINE__) void check_cuda(cudaError_t result, const char *func, const char *file, int line) { @@ -35,7 +37,7 @@ __device__ bool hit_sphere(const vec3& center, float radius, const ray& r) { __device__ vec3 color(const ray& r, hitable **world, curandState *local_rand_state) { ray cur_ray = r; vec3 cur_attenuation = vec3(1,1,1); - for (int i = 0; i < 50; i++) { + for (int i = 0; i < MAX_BOUNCES; i++) { hit_record rec; if ((*world)->hit(cur_ray, 0.001f, FLT_MAX, rec)) { ray scattered; @@ -90,7 +92,7 @@ __global__ void 
render(vec3 *fb, ray r = (*cam)->get_ray(u,v, &local_rand_state); col += color(r, world, &local_rand_state); } - rand_state[pixel_idx] = local_rand_state; + // rand_state[pixel_idx] = local_rand_state; col /= float(ns); col[0] = sqrt(col[0]); @@ -99,7 +101,7 @@ __global__ void render(vec3 *fb, fb[pixel_idx] = col; } -__global__ void create_world(hitable **d_list, int d_list_size, hitable **d_world, camera **d_camera, int nx, int ny, curandState *rand_state) { +__global__ void create_world(hitable **d_list, int d_list_size, hitable **d_world, camera **d_camera, int nx, int ny, curandState *rand_state, int ball_count) { if (!(threadIdx.x == 0 && blockIdx.y == 0)) return; curandState local_rand_state = *rand_state; @@ -108,10 +110,24 @@ __global__ void create_world(hitable **d_list, int d_list_size, hitable **d_worl #define RND() curand_uniform(&local_rand_state) int idx = 1; - for (int a = -11; a < 11; a++) { - for (int b = -11; b < 11; b++) { + // for (int a = -11; a < 11; a++) { + // for (int b = -11; b < 11; b++) { + // float choose_mat = RND(); + // vec3 center(a + RND(), 0.2, b + RND()); + // material *mat; + // if (choose_mat < 0.8f) { + // mat = new lambertian(vec3(RND()*RND(), RND()*RND(), RND()*RND())); + // } else if (choose_mat < 0.95f) { + // mat = new metal(vec3(0.5f*(1.0f+RND()), 0.5f*(1.0f+RND()), 0.5f*(1.0f+RND())), 0.5f*RND()); + // } else { + // mat = new dielectric(1.5); + // } + // d_list[idx++] = new sphere(center, 0.2, mat); + // } + // } + for (int i = 0; i < ball_count; i++) { float choose_mat = RND(); - vec3 center(a + RND(), 0.2, b + RND()); + vec3 center(RND()*22.0f-11.0f, 0.2f, RND()*22.0f-11.0f); material *mat; if (choose_mat < 0.8f) { mat = new lambertian(vec3(RND()*RND(), RND()*RND(), RND()*RND())); @@ -121,7 +137,6 @@ __global__ void create_world(hitable **d_list, int d_list_size, hitable **d_worl mat = new dielectric(1.5); } d_list[idx++] = new sphere(center, 0.2, mat); - } } #undef RND @@ -160,11 +175,14 @@ __global__ void 
free_world(hitable **d_list, int d_list_size, hitable **d_world, int main() { const char *image_filename = "out.ppm"; - int nx = 1200; - int ny = 600; - int ns = 20; - int tx = 16; - int ty = 16; + int nx = 1000; + int ny = 1000; + int ns = 100; + int tx = 24; + int ty = 24; + int number_of_balls = 300; + + int runs = 5; std::cout << "Rendering a " << nx << "x" << ny << " image "; std::cout << "in " << tx << "x" << ty << " blocks.\n"; @@ -183,37 +201,46 @@ int main() { checkCudaErrors(cudaMalloc((void **)&d_rand_world_state, num_pixels*sizeof(curandState))); // Initialize world rand state + std::cout << "Initializing world random\n"; rand_init<<<1,1>>>(d_rand_world_state); checkCudaErrors(cudaGetLastError()); checkCudaErrors(cudaDeviceSynchronize()); // populate world + std::cout << "Populating world\n"; hitable **d_list; - int d_list_size = 22*22 + 1 + 3; + int d_list_size = number_of_balls + 1 + 3; checkCudaErrors(cudaMalloc((void **)&d_list, d_list_size*sizeof(hitable *))); hitable **d_world; checkCudaErrors(cudaMalloc((void **)&d_world, sizeof(hitable *))); camera **d_camera; checkCudaErrors(cudaMalloc((void **)&d_camera, sizeof(camera *))); - create_world<<<1,1>>>(d_list, d_list_size, d_world, d_camera, nx, ny, d_rand_world_state); + create_world<<<1,1>>>(d_list, d_list_size, d_world, d_camera, nx, ny, d_rand_world_state, number_of_balls); checkCudaErrors(cudaGetLastError()); checkCudaErrors(cudaDeviceSynchronize()); // Render frame buffer - clock_t start = clock(); - { - dim3 blocks(nx/tx+1, ny/ty+1); - dim3 threads(tx, ty); + std::cout << "Started rendering\n"; + double average_time = 0; + for (int i = 0; i < runs; i++) { + clock_t start = clock(); + { + dim3 blocks(nx/tx+1, ny/ty+1); + dim3 threads(tx, ty); - render_init<<<blocks, threads>>>(nx, ny, d_rand_state); + render_init<<<blocks, threads>>>(nx, ny, d_rand_state); - render<<<blocks, threads>>>(fb, nx, ny, ns, d_camera, d_world, d_rand_state); - checkCudaErrors(cudaGetLastError()); - checkCudaErrors(cudaDeviceSynchronize()); + render<<<blocks, threads>>>(fb, nx, ny, 
ns, d_camera, d_world, d_rand_state); + checkCudaErrors(cudaGetLastError()); + checkCudaErrors(cudaDeviceSynchronize()); + } + clock_t stop = clock(); + double timer_seconds = ((double)(stop - start)) / CLOCKS_PER_SEC; + std::cout << "took " << timer_seconds << " seconds.\n"; + average_time += timer_seconds; } - clock_t stop = clock(); - double timer_seconds = ((double)(stop - start)) / CLOCKS_PER_SEC; - std::cout << "took " << timer_seconds << " seconds.\n"; + average_time /= runs; + std::cout << "average time: " << average_time << " seconds.\n"; // Saveing frame buffer FILE *f = fopen(image_filename, "w");