diff --git a/CMakeLists.txt b/CMakeLists.txt
deleted file mode 100644
index 5229a64..0000000
--- a/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-cmake_minimum_required(VERSION 3.12)
-project(raytracing-in-a-weekend)
-
-set(CMAKE_CXX_STANDARD 14)
-enable_language(CUDA)
-
-add_executable(main src/main.cu)
-set_target_properties(main PROPERTIES CUDA_ARCHITECTURES all-major)
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..2d5508a
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,41 @@
+CUDA_PATH     ?= /usr/local/cuda
+HOST_COMPILER  = g++
+NVCC           = $(CUDA_PATH)/bin/nvcc
+
+# select one of these for Debug vs. Release
+NVCC_DBG       = -g -G
+#NVCC_DBG       =
+
+NVCCFLAGS      = $(NVCC_DBG) -m64
+GENCODE_FLAGS  = --gpu-architecture=compute_50 --gpu-code=compute_50,sm_50,sm_52
+
+BUILD_DIR = build
+SRCS = src/main.cu
+INCS = src/vec3.cpp src/ray.cpp
+
+$(BUILD_DIR)/cudart: $(BUILD_DIR)/cudart.o
+	$(NVCC) $(NVCCFLAGS) $(GENCODE_FLAGS) -o $(BUILD_DIR)/cudart $(BUILD_DIR)/cudart.o
+
+$(BUILD_DIR)/cudart.o: $(SRCS) $(INCS)
+	mkdir -p $(BUILD_DIR)
+	$(NVCC) $(NVCCFLAGS) $(GENCODE_FLAGS) -o $(BUILD_DIR)/cudart.o -c src/main.cu
+
+out.ppm: $(BUILD_DIR)/cudart
+	rm -f out.ppm
+	./$(BUILD_DIR)/cudart
+
+out.jpg: out.ppm
+	rm -f out.jpg
+	ppmtojpeg out.ppm > out.jpg
+
+profile_basic: $(BUILD_DIR)/cudart
+	nvprof ./$(BUILD_DIR)/cudart
+
+
+# use nvprof --query-metrics
+profile_metrics: $(BUILD_DIR)/cudart
+	nvprof --metrics achieved_occupancy,inst_executed,inst_fp_32,inst_fp_64,inst_integer ./cudart
+
+clean:
+	rm -rf $(BUILD_DIR)
+	rm -f out.ppm out.jpg
diff --git a/run.sh b/run.sh
index c200fdb..d38753d 100755
--- a/run.sh
+++ b/run.sh
@@ -1,6 +1,2 @@
 #!/bin/sh
-mkdir -p build
-cd build
-cmake ..
-make
-./main
+make out.jpg
diff --git a/src/main.cu b/src/main.cu
index c232729..ca8162e 100644
--- a/src/main.cu
+++ b/src/main.cu
@@ -3,80 +3,95 @@
 #include <iostream>
 #include <time.h>
 
+#include "vec3.cpp"
+#include "ray.cpp"
+
 #define checkCudaErrors(val) check_cuda((val), #val, __FILE__, __LINE__)
 
 void check_cuda(cudaError_t result, const char *func, const char *file, int line) {
-	if (result) {
-                std::cout << "CUDA error: " << cudaGetErrorString(result) << " (error code " << static_cast<unsigned int>(result) << ")";
-		std::cout << " at " << file << ":" << line << " '" << func << "' \n";
-		cudaDeviceReset();
-		exit(-1);
-	}
+    if (result) {
+        std::cout << "CUDA error: " << cudaGetErrorString(result) << " (error code " << static_cast<unsigned int>(result) << ")";
+        std::cout << " at " << file << ":" << line << " '" << func << "' \n";
+        cudaDeviceReset();
+        exit(-1);
+    }
 }
 
-__global__ void render(float *fb, int max_x, int max_y) {
+__device__ vec3 color(const ray& r) {
+    vec3 unit_direction = unit_vector(r.direction());
+    float t = 0.5f * (unit_direction.y() + 1.0f);
+    return (1.0f - t)*vec3(1.0,1.0,1.0) + t*vec3(1.0, 0.0, 0.0);
+}
+
+__global__ void render(vec3 *fb, int max_x, int max_y, vec3 lower_left_corner, vec3 horizontal, vec3 vertical, vec3 origin) {
     int x = threadIdx.x + blockIdx.x * blockDim.x;
     int y = threadIdx.y + blockIdx.y * blockDim.y;
     if ((x >= max_x) || (y >= max_y)) return;
-    int pixel_idx = 3*(y*max_x + x);
+    int pixel_idx = y*max_x + x;
+    float u = float(x) / max_x;
+    float v = float(y) / max_y;
 
-    fb[pixel_idx + 0] = float(x) / max_x;
-    fb[pixel_idx + 1] = float(y) / max_y;
-    fb[pixel_idx + 2] = 0.2;
+    ray r(origin, lower_left_corner + u*horizontal + v*vertical);
+    fb[pixel_idx] = color(r);
 }
 
 int main() {
-	const char *image_filename = "image.ppm";
-	int nx = 1200;
-	int ny = 600;
-	int tx = 8;
-	int ty = 8;
+    const char *image_filename = "out.ppm";
+    int nx = 1200;
+    int ny = 600;
+    int tx = 8;
+    int ty = 8;
 
-	int num_pixels = nx*ny;
-	size_t fb_size = 3*num_pixels*sizeof(float);
+    std::cout << "Rendering a " << nx << "x" << ny << " image ";
+    std::cout << "in " << tx << "x" << ty << " blocks.\n";
 
-	float *fb;
-	checkCudaErrors(cudaMallocManaged(&fb, fb_size));
+    int num_pixels = nx*ny;
+    size_t fb_size = num_pixels*sizeof(vec3);
 
-	clock_t start = clock();
-	{
-		dim3 blocks(nx/tx+1, ny/ty+1);
-		dim3 threads(tx, ty);
+    vec3 *fb;
+    checkCudaErrors(cudaMallocManaged(&fb, fb_size));
 
-		render<<<blocks, threads>>>(fb, nx, ny);
-                checkCudaErrors(cudaGetLastError());
-                checkCudaErrors(cudaDeviceSynchronize());
-	}
-	clock_t stop = clock();
-	double timer_seconds = ((double)(stop - start)) / CLOCKS_PER_SEC;
-	std::cout << "took " << timer_seconds << " seconds.\n";
+    clock_t start = clock();
+    {
+        dim3 blocks(nx/tx+1, ny/ty+1);
+        dim3 threads(tx, ty);
 
-	FILE *f = fopen(image_filename, "w");
-	assert(f);
+        render<<<blocks, threads>>>(fb, nx, ny,
+                                    vec3(-2.0, -1.0, -1.0),
+                                    vec3(4.0, 0.0, 0.0),
+                                    vec3(0.0, 2.0, 0.0),
+                                    vec3(0.0, 0.0, 0.0));
+        checkCudaErrors(cudaGetLastError());
+        checkCudaErrors(cudaDeviceSynchronize());
+    }
+    clock_t stop = clock();
+    double timer_seconds = ((double)(stop - start)) / CLOCKS_PER_SEC;
+    std::cout << "took " << timer_seconds << " seconds.\n";
 
-	char header[128] = { 0 };
-	int header_size = snprintf(header, sizeof(header), "P3\n%d %d\n255\n", nx, ny);
-	fwrite(header, header_size, 1, f);
+    FILE *f = fopen(image_filename, "w");
+    assert(f);
 
-	for (int y=ny-1; y >= 0; y--) {
-		for (int x=0; x < nx; x++) {
-			size_t pixel_idx = 3*(y*nx + x);
-			float r = fb[pixel_idx + 0];
-			float g = fb[pixel_idx + 1];
-			float b = fb[pixel_idx + 2];
-			int ir = int(255.99*r);
-			int ig = int(255.99*g);
-			int ib = int(255.99*b);
+    char header[128] = { 0 };
+    int header_size = snprintf(header, sizeof(header), "P3\n%d %d\n255\n", nx, ny);
+    fwrite(header, header_size, 1, f);
 
-			char pixel_str[128] = { 0 };
-			int str_size = snprintf(pixel_str, sizeof(pixel_str), "%d %d %d\n", ir, ig, ib);
-			fwrite(pixel_str, str_size, 1, f);
-		}
-	}
+    for (int y=ny-1; y >= 0; y--) {
+        for (int x=0; x < nx; x++) {
+            size_t pixel_idx = (y*nx + x);
+            vec3 pixel = fb[pixel_idx];
+            int ir = int(255.99*pixel.r());
+            int ig = int(255.99*pixel.g());
+            int ib = int(255.99*pixel.b());
 
-	fclose(f);
+            char pixel_str[128] = { 0 };
+            int str_size = snprintf(pixel_str, sizeof(pixel_str), "%d %d %d\n", ir, ig, ib);
+            fwrite(pixel_str, str_size, 1, f);
+        }
+    }
 
-	checkCudaErrors(cudaFree(fb));
+    fclose(f);
 
-	return 0;
+    checkCudaErrors(cudaFree(fb));
+
+    return 0;
 }
diff --git a/src/ray.cpp b/src/ray.cpp
new file mode 100644
index 0000000..d3464a0
--- /dev/null
+++ b/src/ray.cpp
@@ -0,0 +1,14 @@
+#include "vec3.cpp"
+
+class ray
+{
+    public:
+        __device__ ray() {}
+        __device__ ray(const vec3& a, const vec3& b) { A = a; B = b; }
+        __device__ vec3 origin() const       { return A; }
+        __device__ vec3 direction() const    { return B; }
+        __device__ vec3 point_at_parameter(float t) const { return A + t*B; }
+
+        vec3 A;
+        vec3 B;
+};
diff --git a/src/vec3.cpp b/src/vec3.cpp
new file mode 100644
index 0000000..840af54
--- /dev/null
+++ b/src/vec3.cpp
@@ -0,0 +1,142 @@
+#pragma once
+
+#include "cuda_runtime.h"
+#include <math.h>
+#include <stdlib.h>
+#include <iostream>
+
+class vec3  {
+
+public:
+    __host__ __device__ vec3() {}
+    __host__ __device__ vec3(float e0, float e1, float e2) { e[0] = e0; e[1] = e1; e[2] = e2; }
+    __host__ __device__ inline float x() const { return e[0]; }
+    __host__ __device__ inline float y() const { return e[1]; }
+    __host__ __device__ inline float z() const { return e[2]; }
+    __host__ __device__ inline float r() const { return e[0]; }
+    __host__ __device__ inline float g() const { return e[1]; }
+    __host__ __device__ inline float b() const { return e[2]; }
+
+    __host__ __device__ inline const vec3& operator+() const { return *this; }
+    __host__ __device__ inline vec3 operator-() const { return vec3(-e[0], -e[1], -e[2]); }
+    __host__ __device__ inline float operator[](int i) const { return e[i]; }
+    __host__ __device__ inline float& operator[](int i) { return e[i]; };
+
+    __host__ __device__ inline vec3& operator+=(const vec3 &v2);
+    __host__ __device__ inline vec3& operator-=(const vec3 &v2);
+    __host__ __device__ inline vec3& operator*=(const vec3 &v2);
+    __host__ __device__ inline vec3& operator/=(const vec3 &v2);
+    __host__ __device__ inline vec3& operator*=(const float t);
+    __host__ __device__ inline vec3& operator/=(const float t);
+
+    __host__ __device__ inline float length() const { return sqrt(e[0]*e[0] + e[1]*e[1] + e[2]*e[2]); }
+    __host__ __device__ inline float squared_length() const { return e[0]*e[0] + e[1]*e[1] + e[2]*e[2]; }
+    __host__ __device__ inline void make_unit_vector();
+
+
+    float e[3];
+};
+
+
+
+inline std::istream& operator>>(std::istream &is, vec3 &t) {
+    is >> t.e[0] >> t.e[1] >> t.e[2];
+    return is;
+}
+
+inline std::ostream& operator<<(std::ostream &os, const vec3 &t) {
+    os << t.e[0] << " " << t.e[1] << " " << t.e[2];
+    return os;
+}
+
+__host__ __device__ inline void vec3::make_unit_vector() {
+    float k = 1.0 / sqrt(e[0]*e[0] + e[1]*e[1] + e[2]*e[2]);
+    e[0] *= k; e[1] *= k; e[2] *= k;
+}
+
+__host__ __device__ inline vec3 operator+(const vec3 &v1, const vec3 &v2) {
+    return vec3(v1.e[0] + v2.e[0], v1.e[1] + v2.e[1], v1.e[2] + v2.e[2]);
+}
+
+__host__ __device__ inline vec3 operator-(const vec3 &v1, const vec3 &v2) {
+    return vec3(v1.e[0] - v2.e[0], v1.e[1] - v2.e[1], v1.e[2] - v2.e[2]);
+}
+
+__host__ __device__ inline vec3 operator*(const vec3 &v1, const vec3 &v2) {
+    return vec3(v1.e[0] * v2.e[0], v1.e[1] * v2.e[1], v1.e[2] * v2.e[2]);
+}
+
+__host__ __device__ inline vec3 operator/(const vec3 &v1, const vec3 &v2) {
+    return vec3(v1.e[0] / v2.e[0], v1.e[1] / v2.e[1], v1.e[2] / v2.e[2]);
+}
+
+__host__ __device__ inline vec3 operator*(float t, const vec3 &v) {
+    return vec3(t*v.e[0], t*v.e[1], t*v.e[2]);
+}
+
+__host__ __device__ inline vec3 operator/(vec3 v, float t) {
+    return vec3(v.e[0]/t, v.e[1]/t, v.e[2]/t);
+}
+
+__host__ __device__ inline vec3 operator*(const vec3 &v, float t) {
+    return vec3(t*v.e[0], t*v.e[1], t*v.e[2]);
+}
+
+__host__ __device__ inline float dot(const vec3 &v1, const vec3 &v2) {
+    return v1.e[0] *v2.e[0] + v1.e[1] *v2.e[1]  + v1.e[2] *v2.e[2];
+}
+
+__host__ __device__ inline vec3 cross(const vec3 &v1, const vec3 &v2) {
+    return vec3( (v1.e[1]*v2.e[2] - v1.e[2]*v2.e[1]),
+                (-(v1.e[0]*v2.e[2] - v1.e[2]*v2.e[0])),
+                (v1.e[0]*v2.e[1] - v1.e[1]*v2.e[0]));
+}
+
+
+__host__ __device__ inline vec3& vec3::operator+=(const vec3 &v){
+    e[0]  += v.e[0];
+    e[1]  += v.e[1];
+    e[2]  += v.e[2];
+    return *this;
+}
+
+__host__ __device__ inline vec3& vec3::operator*=(const vec3 &v){
+    e[0]  *= v.e[0];
+    e[1]  *= v.e[1];
+    e[2]  *= v.e[2];
+    return *this;
+}
+
+__host__ __device__ inline vec3& vec3::operator/=(const vec3 &v){
+    e[0]  /= v.e[0];
+    e[1]  /= v.e[1];
+    e[2]  /= v.e[2];
+    return *this;
+}
+
+__host__ __device__ inline vec3& vec3::operator-=(const vec3& v) {
+    e[0]  -= v.e[0];
+    e[1]  -= v.e[1];
+    e[2]  -= v.e[2];
+    return *this;
+}
+
+__host__ __device__ inline vec3& vec3::operator*=(const float t) {
+    e[0]  *= t;
+    e[1]  *= t;
+    e[2]  *= t;
+    return *this;
+}
+
+__host__ __device__ inline vec3& vec3::operator/=(const float t) {
+    float k = 1.0/t;
+
+    e[0]  *= k;
+    e[1]  *= k;
+    e[2]  *= k;
+    return *this;
+}
+
+__host__ __device__ inline vec3 unit_vector(vec3 v) {
+    return v / v.length();
+}