diff --git a/CMakeLists.txt b/CMakeLists.txt
index dff84f8..c3c3e2c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,10 +97,10 @@ target_link_libraries(${CMAKE_PROJECT_NAME}
     ${CORELIBS}
     )
 
-add_custom_command(
-    TARGET ${CMAKE_PROJECT_NAME}
-    POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy_directory
-        ${CMAKE_SOURCE_DIR}/shaders
-        ${CMAKE_BINARY_DIR}/shaders
-    )
+#add_custom_command(
+#    TARGET ${CMAKE_PROJECT_NAME}
+#    POST_BUILD
+#    COMMAND ${CMAKE_COMMAND} -E copy_directory
+#        ${CMAKE_SOURCE_DIR}/shaders
+#        ${CMAKE_BINARY_DIR}/shaders
+#    )
diff --git a/README.md b/README.md
index 41b91f0..94bdfc0 100644
--- a/README.md
+++ b/README.md
@@ -5,17 +5,46 @@ CUDA Rasterizer
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4**
 
-* (TODO) YOUR NAME HERE
-	* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
-
-### (TODO: Your README)
-
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
+* Yan Wu
+	* [LinkedIn](https://www.linkedin.com/in/yan-wu-a71270159/)
+* Tested on: Windows 10, i7-8750H @ 2.20GHz 16GB, GTX 1060 6GB (Personal Laptop)
+
+### Brief Introduction
+In this project, I used CUDA to implement a simplified rasterized graphics pipeline, similar to the OpenGL pipeline. I implemented vertex shading, primitive assembly, rasterization, fragment shading, and a framebuffer for the basic pipeline. And there are also some extra features.
+
+* [What is rasterization?](https://en.wikipedia.org/wiki/Rasterisation)
+* Basic pipeline
+  * Clear the fragment buffer with some default value.
+  * Vertex shading
+  * Primitive assembly
+  * Rasterization
+  * Fragments to depth buffer
+  * Fragment shading
+  * Fragment to framebuffer writing
+* SSAA implemented
+* UV texture mapping with bilinear texture filtering and perspective correct texture coordinates
+
+### Results
+
+* Results for basic Pipelines<br />
+  <img src="/images/basic pipeline/triangle.gif" width="30%">
+  <img src="/images/basic pipeline/box.gif" width="30%">
+  <img src="/images/basic pipeline/duck.gif" width="30%">
+  
+* After implementing SSAA <br />
+  <img src="/images/SSAA/truck.gif" width="45%">
+  <img src="/images/SSAA/truck2.gif" width="45%"> <br />
+  - Clearly, after implementing SSAA, the aliasing problem decreases. If you look carefully at the outline of the truck, the left one is more tooth-shaped.
+
+* Implementing UV texture mapping with [bilinear texture filtering](https://en.wikipedia.org/wiki/Bilinear_filtering#Sample_code) and [perspective correct texture coordinates](https://www.scratchapixel.com/lessons/3d-basic-rendering/rasterization-practical-implementation/perspective-correct-interpolation-vertex-attributes) <br />
+  <img src="/images/texture mapping/board.gif" width="45%">
+  <img src="/images/texture mapping/board1.gif" width="45%"> <br />
+  - After implementing these two features, we can see the difference. With the left original result, the perspective is weird and the texture resolution is not as good as the right one. The right result looks much more comfortable.
+  
 
 
 ### Credits
 
+* [ScreenToGif(used in this project)](https://www.screentogif.com/)
 * [tinygltfloader](https://github.com/syoyo/tinygltfloader) by [@soyoyo](https://github.com/syoyo)
 * [glTF Sample Models](https://github.com/KhronosGroup/glTF/blob/master/sampleModels/README.md)
diff --git a/images/SSAA/truck.gif b/images/SSAA/truck.gif
new file mode 100644
index 0000000..d8e02e8
Binary files /dev/null and b/images/SSAA/truck.gif differ
diff --git a/images/SSAA/truck2.gif b/images/SSAA/truck2.gif
new file mode 100644
index 0000000..4fc95df
Binary files /dev/null and b/images/SSAA/truck2.gif differ
diff --git a/images/basic pipeline/box.gif b/images/basic pipeline/box.gif
new file mode 100644
index 0000000..c2a7d76
Binary files /dev/null and b/images/basic pipeline/box.gif differ
diff --git a/images/basic pipeline/duck.gif b/images/basic pipeline/duck.gif
new file mode 100644
index 0000000..8aa8001
Binary files /dev/null and b/images/basic pipeline/duck.gif differ
diff --git a/images/basic pipeline/triangle.gif b/images/basic pipeline/triangle.gif
new file mode 100644
index 0000000..f2f227f
Binary files /dev/null and b/images/basic pipeline/triangle.gif differ
diff --git a/images/texture mapping/board.gif b/images/texture mapping/board.gif
new file mode 100644
index 0000000..0fd3ba1
Binary files /dev/null and b/images/texture mapping/board.gif differ
diff --git a/images/texture mapping/board1.gif b/images/texture mapping/board1.gif
new file mode 100644
index 0000000..253b8a0
Binary files /dev/null and b/images/texture mapping/board1.gif differ
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a57f69f..00edee0 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -6,5 +6,5 @@ set(SOURCE_FILES
 
 cuda_add_library(src
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_61
     )
diff --git a/src/rasterize.cu b/src/rasterize.cu
index 1262a09..21f111a 100644
--- a/src/rasterize.cu
+++ b/src/rasterize.cu
@@ -1,10 +1,10 @@
 /**
- * @file      rasterize.cu
- * @brief     CUDA-accelerated rasterization pipeline.
- * @authors   Skeleton code: Yining Karl Li, Kai Ninomiya, Shuai Shao (Shrek)
- * @date      2012-2016
- * @copyright University of Pennsylvania & STUDENT
- */
+* @file      rasterize.cu
+* @brief     CUDA-accelerated rasterization pipeline.
+* @authors   Skeleton code: Yining Karl Li, Kai Ninomiya, Shuai Shao (Shrek)
+* @date      2012-2016
+* @copyright University of Pennsylvania & STUDENT
+*/
 
 #include <cmath>
 #include <cstdio>
@@ -17,6 +17,11 @@
 #include "rasterize.h"
 #include <glm/gtc/quaternion.hpp>
 #include <glm/gtc/matrix_transform.hpp>
+#include <device_launch_parameters.h>
+
+#define SSAA 0
+#define BILINEAR_TEXTURE 0
+#define PERSPECTIVE 0
 
 namespace {
 
@@ -28,7 +33,7 @@ namespace {
 
 	typedef unsigned char BufferByte;
 
-	enum PrimitiveType{
+	enum PrimitiveType {
 		Point = 1,
 		Line = 2,
 		Triangle = 3
@@ -41,12 +46,12 @@ namespace {
 		// The attributes listed below might be useful, 
 		// but always feel free to modify on your own
 
-		 glm::vec3 eyePos;	// eye space position used for shading
-		 glm::vec3 eyeNor;	// eye space normal used for shading, cuz normal will go wrong after perspective transformation
-		// glm::vec3 col;
-		 glm::vec2 texcoord0;
-		 TextureData* dev_diffuseTex = NULL;
-		// int texWidth, texHeight;
+		glm::vec3 eyePos;	// eye space position used for shading
+		glm::vec3 eyeNor;	// eye space normal used for shading, cuz normal will go wrong after perspective transformation
+							// glm::vec3 col;
+		glm::vec2 texcoord0;
+		TextureData* dev_diffuseTex = NULL;
+		int texWidth, texHeight;
 		// ...
 	};
 
@@ -62,10 +67,13 @@ namespace {
 		// The attributes listed below might be useful, 
 		// but always feel free to modify on your own
 
-		// glm::vec3 eyePos;	// eye space position used for shading
-		// glm::vec3 eyeNor;
-		// VertexAttributeTexcoord texcoord0;
-		// TextureData* dev_diffuseTex;
+		glm::vec3 eyePos;	// eye space position used for shading
+		glm::vec3 eyeNor;
+		VertexAttributeTexcoord texcoord0;
+		TextureData* dev_diffuseTex = NULL;
+
+		int width, height;
+		float depth;
 		// ...
 	};
 
@@ -111,58 +119,127 @@ static glm::vec3 *dev_framebuffer = NULL;
 
 static int * dev_depth = NULL;	// you might need this buffer when doing depth test
 
-/**
- * Kernel that writes the image to the OpenGL PBO directly.
- */
-__global__ 
+								/**
+								* Kernel that writes the image to the OpenGL PBO directly.
+								*/
+__global__
 void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-    int index = x + (y * w);
-
-    if (x < w && y < h) {
-        glm::vec3 color;
-        color.x = glm::clamp(image[index].x, 0.0f, 1.0f) * 255.0;
-        color.y = glm::clamp(image[index].y, 0.0f, 1.0f) * 255.0;
-        color.z = glm::clamp(image[index].z, 0.0f, 1.0f) * 255.0;
-        // Each thread writes one pixel location in the texture (textel)
-        pbo[index].w = 0;
-        pbo[index].x = color.x;
-        pbo[index].y = color.y;
-        pbo[index].z = color.z;
-    }
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+	int index = x + (y * w);
+
+	if (x < w && y < h) {
+		glm::vec3 color;
+#if SSAA
+		for (int i = 0; i < SSAA; i++) {
+			for (int j = 0; j < SSAA; j++) {
+				int x1 = SSAA * x + i;
+				int y1 = SSAA * y + j;
+				int w1 = SSAA * w;
+				int index1 = x1 + y1 * w1;
+				color.x += glm::clamp(image[index1].x, 0.0f, 1.0f) * 255.0;
+				color.y += glm::clamp(image[index1].y, 0.0f, 1.0f) * 255.0;
+				color.z += glm::clamp(image[index1].z, 0.0f, 1.0f) * 255.0;
+			}
+		}
+		color /= (float)SSAA * SSAA;
+#else
+		color.x = glm::clamp(image[index].x, 0.0f, 1.0f) * 255.0;
+		color.y = glm::clamp(image[index].y, 0.0f, 1.0f) * 255.0;
+		color.z = glm::clamp(image[index].z, 0.0f, 1.0f) * 255.0;
+#endif
+		// Each thread writes one pixel location in the texture (textel)
+		pbo[index].w = 0;
+		pbo[index].x = color.x;
+		pbo[index].y = color.y;
+		pbo[index].z = color.z;
+	}
 }
 
-/** 
+/**
 * Writes fragment colors to the framebuffer
 */
 __global__
 void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-    int index = x + (y * w);
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+	int index = x + (y * w);
 
-    if (x < w && y < h) {
-        framebuffer[index] = fragmentBuffer[index].color;
+	if (x < w && y < h) {
+		framebuffer[index] = fragmentBuffer[index].color;
 
 		// TODO: add your fragment shader code here
-
-    }
+		glm::vec3 diffuse = glm::vec3(0.f);
+		Fragment fragment_Buffer = fragmentBuffer[index];
+		float fragment_width = fragment_Buffer.width;
+		float fragment_height = fragment_Buffer.height;
+		glm::vec3 light_pos = glm::vec3(10.f, 10.f, 10.f);
+		glm::vec3 light_dir = glm::normalize(light_pos - fragment_Buffer.eyePos);
+		if (fragment_Buffer.dev_diffuseTex != NULL) {
+			TextureData *texture_data = fragment_Buffer.dev_diffuseTex;
+#if	BILINEAR_TEXTURE
+			float u = fragment_Buffer.texcoord0.x * fragment_width;
+			float v = fragment_Buffer.texcoord0.y * fragment_height;
+			int x = (int)u;
+			int y = (int)v;
+			float u_ratio = u - x;
+			float v_ratio = v - y;
+			float u_opposite = 1.f - u_ratio;
+			float v_opposite = 1.f - v_ratio;
+
+			int pos0 = x + y*fragment_width;
+			int pos1 = (x + 1) + y*fragment_width;
+			int pos2 = x + (y + 1)*fragment_width;
+			int pos3 = (x + 1) + (y + 1)*fragment_width;
+
+			float result1 = v_opposite * (texture_data[3 * pos0] * u_opposite + texture_data[3 * pos1] * u_ratio)
+				+ v_ratio * (texture_data[3 * pos2] * u_opposite + texture_data[3 * pos3] * u_ratio);
+
+			float result2 = v_opposite * (texture_data[3 * pos0 + 1] * u_opposite + texture_data[3 * pos1 + 1] * u_ratio)
+				+ v_ratio * (texture_data[3 * pos2 + 1] * u_opposite + texture_data[3 * pos3 + 1] * u_ratio);
+
+			float result3 = v_opposite * (texture_data[3 * pos0 + 2] * u_opposite + texture_data[3 * pos1 + 2] * u_ratio)
+				+ v_ratio * (texture_data[3 * pos2 + 2] * u_opposite + texture_data[3 * pos3 + 2] * u_ratio);
+
+			diffuse = glm::vec3(result1, result2, result3) / 255.f;
+#else
+			int x1 = fragment_Buffer.texcoord0.x * fragment_width;
+			int y1 = fragment_Buffer.texcoord0.y * fragment_height;
+			int index1 = x1 + y1*fragment_width;
+			diffuse = glm::vec3(texture_data[3 * index1] / 255.f,
+				texture_data[3 * index1 + 1] / 255.f,
+				texture_data[3 * index1 + 2] / 255.f);
+#endif
+		}
+		else {
+			diffuse = fragment_Buffer.color;
+		}
+		float cos_theta = glm::dot(fragment_Buffer.eyeNor, light_dir);
+		framebuffer[index] = diffuse * cos_theta;
+		// BLINNPHONG
+		glm::vec3 H = glm::normalize(light_dir - fragment_Buffer.eyePos);
+		float specular = pow(glm::max(glm::dot(fragment_Buffer.eyeNor, H), 0.f), 100.f);
+		framebuffer[index] += specular * glm::vec3(1.f, 1.f, 1.f);
+	}
 }
 
 /**
- * Called once at the beginning of the program to allocate memory.
- */
+* Called once at the beginning of the program to allocate memory.
+*/
 void rasterizeInit(int w, int h) {
-    width = w;
-    height = h;
+	width = w;
+	height = h;
+#if SSAA
+	width *= SSAA;
+	height *= SSAA;
+#endif
 	cudaFree(dev_fragmentBuffer);
 	cudaMalloc(&dev_fragmentBuffer, width * height * sizeof(Fragment));
 	cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
-    cudaFree(dev_framebuffer);
-    cudaMalloc(&dev_framebuffer,   width * height * sizeof(glm::vec3));
-    cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3));
-    
+	cudaFree(dev_framebuffer);
+	cudaMalloc(&dev_framebuffer, width * height * sizeof(glm::vec3));
+	cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3));
+
 	cudaFree(dev_depth);
 	cudaMalloc(&dev_depth, width * height * sizeof(int));
 
@@ -187,9 +264,9 @@ void initDepth(int w, int h, int * depth)
 * kern function with support for stride to sometimes replace cudaMemcpy
 * One thread is responsible for copying one component
 */
-__global__ 
+__global__
 void _deviceBufferCopy(int N, BufferByte* dev_dst, const BufferByte* dev_src, int n, int byteStride, int byteOffset, int componentTypeByteSize) {
-	
+
 	// Attribute (vec3 position)
 	// component (3 * float)
 	// byte (4 * byte)
@@ -202,20 +279,20 @@ void _deviceBufferCopy(int N, BufferByte* dev_dst, const BufferByte* dev_src, in
 		int offset = i - count * n;	// which component of the attribute
 
 		for (int j = 0; j < componentTypeByteSize; j++) {
-			
-			dev_dst[count * componentTypeByteSize * n 
-				+ offset * componentTypeByteSize 
+
+			dev_dst[count * componentTypeByteSize * n
+				+ offset * componentTypeByteSize
 				+ j]
 
-				= 
+				=
 
-			dev_src[byteOffset 
-				+ count * (byteStride == 0 ? componentTypeByteSize * n : byteStride) 
-				+ offset * componentTypeByteSize 
+				dev_src[byteOffset
+				+ count * (byteStride == 0 ? componentTypeByteSize * n : byteStride)
+				+ offset * componentTypeByteSize
 				+ j];
 		}
 	}
-	
+
 
 }
 
@@ -235,7 +312,7 @@ void _nodeMatrixTransform(
 }
 
 glm::mat4 getMatrixFromNodeMatrixVector(const tinygltf::Node & n) {
-	
+
 	glm::mat4 curMatrix(1.0);
 
 	const std::vector<double> &m = n.matrix;
@@ -247,7 +324,8 @@ glm::mat4 getMatrixFromNodeMatrixVector(const tinygltf::Node & n) {
 				curMatrix[i][j] = (float)m.at(4 * i + j);
 			}
 		}
-	} else {
+	}
+	else {
 		// no matrix, use rotation, scale, translation
 
 		if (n.translation.size() > 0) {
@@ -275,12 +353,12 @@ glm::mat4 getMatrixFromNodeMatrixVector(const tinygltf::Node & n) {
 	return curMatrix;
 }
 
-void traverseNode (
+void traverseNode(
 	std::map<std::string, glm::mat4> & n2m,
 	const tinygltf::Scene & scene,
 	const std::string & nodeString,
 	const glm::mat4 & parentMatrix
-	) 
+)
 {
 	const tinygltf::Node & n = scene.nodes.at(nodeString);
 	glm::mat4 M = parentMatrix * getMatrixFromNodeMatrixVector(n);
@@ -537,7 +615,7 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) {
 									size_t s = image.image.size() * sizeof(TextureData);
 									cudaMalloc(&dev_diffuseTex, s);
 									cudaMemcpy(dev_diffuseTex, &image.image.at(0), s, cudaMemcpyHostToDevice);
-									
+
 									diffuseTexWidth = image.width;
 									diffuseTexHeight = image.height;
 
@@ -554,7 +632,7 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) {
 
 					// ---------Node hierarchy transform--------
 					cudaDeviceSynchronize();
-					
+
 					dim3 numBlocksNodeTransform((numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
 					_nodeMatrixTransform << <numBlocksNodeTransform, numThreadsPerBlock >> > (
 						numVertices,
@@ -595,21 +673,21 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) {
 		} // for each node
 
 	}
-	
+
 
 	// 3. Malloc for dev_primitives
 	{
 		cudaMalloc(&dev_primitives, totalNumPrimitives * sizeof(Primitive));
 	}
-	
+
 
 	// Finally, cudaFree raw dev_bufferViews
 	{
 
 		std::map<std::string, BufferByte*>::const_iterator it(bufferViewDevPointers.begin());
 		std::map<std::string, BufferByte*>::const_iterator itEnd(bufferViewDevPointers.end());
-			
-			//bufferViewDevPointers
+
+		//bufferViewDevPointers
 
 		for (; it != itEnd; it++) {
 			cudaFree(it->second);
@@ -623,11 +701,11 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) {
 
 
 
-__global__ 
+__global__
 void _vertexTransformAndAssembly(
-	int numVertices, 
-	PrimitiveDevBufPointers primitive, 
-	glm::mat4 MVP, glm::mat4 MV, glm::mat3 MV_normal, 
+	int numVertices,
+	PrimitiveDevBufPointers primitive,
+	glm::mat4 MVP, glm::mat4 MV, glm::mat3 MV_normal,
 	int width, int height) {
 
 	// vertex id
@@ -638,10 +716,25 @@ void _vertexTransformAndAssembly(
 		// Multiply the MVP matrix for each vertex position, this will transform everything into clipping space
 		// Then divide the pos by its w element to transform into NDC space
 		// Finally transform x and y to viewport space
+		glm::vec4 pos = glm::vec4(primitive.dev_position[vid], 1.f);
+		pos = MVP * pos;
+		glm::vec4 eye_pos = MV * pos;
+
+		pos /= pos.w;
+		pos.x = 0.5f * width * (pos.x + 1.f);
+		pos.y = 0.5f * height * (1.f - pos.y);
 
 		// TODO: Apply vertex assembly here
 		// Assemble all attribute arraies into the primitive array
-		
+		primitive.dev_verticesOut[vid].eyeNor = glm::normalize(MV_normal * primitive.dev_normal[vid]);
+		primitive.dev_verticesOut[vid].eyePos = glm::vec3(eye_pos);
+		primitive.dev_verticesOut[vid].pos = pos;
+		if (primitive.dev_diffuseTex != NULL) {
+			primitive.dev_verticesOut[vid].dev_diffuseTex = primitive.dev_diffuseTex;
+			primitive.dev_verticesOut[vid].texcoord0 = primitive.dev_texcoord0[vid];
+			primitive.dev_verticesOut[vid].texWidth = primitive.diffuseTexWidth;
+			primitive.dev_verticesOut[vid].texHeight = primitive.diffuseTexHeight;
+		}
 	}
 }
 
@@ -649,7 +742,7 @@ void _vertexTransformAndAssembly(
 
 static int curPrimitiveBeginId = 0;
 
-__global__ 
+__global__
 void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_primitives, PrimitiveDevBufPointers primitive) {
 
 	// index id
@@ -660,28 +753,101 @@ void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_
 		// TODO: uncomment the following code for a start
 		// This is primitive assembly for triangles
 
-		//int pid;	// id for cur primitives vector
-		//if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) {
-		//	pid = iid / (int)primitive.primitiveType;
-		//	dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType]
-		//		= primitive.dev_verticesOut[primitive.dev_indices[iid]];
-		//}
+		int pid;	// id for cur primitives vector
+		if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) {
+			pid = iid / (int)primitive.primitiveType;
+			dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType]
+				= primitive.dev_verticesOut[primitive.dev_indices[iid]];
+		}
 
 
 		// TODO: other primitive types (point, line)
 	}
-	
+
+}
+
+template <class T>
+__host__ __device__ static
+T Interpolate(glm::vec3 bc, T v0, T v1, T v2)
+{
+	return bc.x*v0 + bc.y*v1 + bc.z*v2;
 }
 
+__global__ void rasterization(int num, Primitive *primitives, Fragment *fragments, int width, int height, int *depths) {
+	int pid = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (pid < num) {
+		Primitive primitive = primitives[pid];
+		glm::vec3 triangle_vertices[3] = {
+			glm::vec3(primitive.v[0].pos),
+			glm::vec3(primitive.v[1].pos),
+			glm::vec3(primitive.v[2].pos)
+		};
+		glm::vec3 eye_pos[3] = {
+			glm::vec3(primitive.v[0].eyePos),
+			glm::vec3(primitive.v[1].eyePos),
+			glm::vec3(primitive.v[2].eyePos)
+		};
+
+		AABB aabb = getAABBForTriangle(triangle_vertices);
+		for (int x = aabb.min.x; x <= aabb.max.x; x++) {
+			for (int y = aabb.min.y; y <= aabb.max.y; y++) {
+				glm::vec3 bary_centric = calculateBarycentricCoordinate(triangle_vertices, glm::vec2(x, y));
+				if (isBarycentricCoordInBounds(bary_centric)) {
+					int index = x + y * width;
+					float depth = getZAtCoordinate(bary_centric, triangle_vertices);
+					int depth1 = INT_MAX * depth;
+					atomicMin(&depths[index], depth1);
+					if (depth1 == depths[index]) {
+						fragments[index].depth = fabs(depth);
+						fragments[index].eyeNor = bary_centric.x * primitive.v[0].eyeNor +
+							bary_centric.y * primitive.v[1].eyeNor +
+							bary_centric.z * primitive.v[2].eyeNor;
+						fragments[index].eyePos = bary_centric.x * primitive.v[0].eyePos +
+							bary_centric.y * primitive.v[1].eyePos +
+							bary_centric.z * primitive.v[2].eyePos;
+#if PERSPECTIVE
+						glm::vec3 w = {
+							bary_centric.x / eye_pos[0].z,
+							bary_centric.y / eye_pos[1].z,
+							bary_centric.z / eye_pos[2].z,
+							};
+						float a = w.x * primitive.v[0].texcoord0.x +
+							w.y * primitive.v[1].texcoord0.x +
+							w.z * primitive.v[2].texcoord0.x;
+
+						float b = w.x * primitive.v[0].texcoord0.y +
+							w.y * primitive.v[1].texcoord0.y +
+							w.z * primitive.v[2].texcoord0.y;
+
+						float c = 1.f / (w.x + w.y + w.z);
+
+						a *= c;
+						b *= c;
+						fragments[index].texcoord0 = glm::vec2(a, b);
+#else
+						fragments[index].texcoord0 = bary_centric.x * primitive.v[0].texcoord0 +
+							bary_centric.y * primitive.v[1].texcoord0 +
+							bary_centric.z * primitive.v[2].texcoord0;
+#endif
+						fragments[index].dev_diffuseTex = primitive.v[0].dev_diffuseTex;
+						fragments[index].height = primitive.v[0].texHeight;
+						fragments[index].width = primitive.v[0].texWidth;
+						fragments[index].color = glm::vec3(1.f, 1.f, 1.f);
+					}
+				}
+			}
+		}
+	}
+}
 
 
 /**
- * Perform rasterization.
- */
+* Perform rasterization.
+*/
 void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal) {
-    int sideLength2d = 8;
-    dim3 blockSize2d(sideLength2d, sideLength2d);
-    dim3 blockCount2d((width  - 1) / blockSize2d.x + 1,
+	int sideLength2d = 8;
+	dim3 blockSize2d(sideLength2d, sideLength2d);
+	dim3 blockCount2d((width - 1) / blockSize2d.x + 1,
 		(height - 1) / blockSize2d.y + 1);
 
 	// Execute your rasterization pipeline here
@@ -706,10 +872,10 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g
 				checkCUDAError("Vertex Processing");
 				cudaDeviceSynchronize();
 				_primitiveAssembly << < numBlocksForIndices, numThreadsPerBlock >> >
-					(p->numIndices, 
-					curPrimitiveBeginId, 
-					dev_primitives, 
-					*p);
+					(p->numIndices,
+						curPrimitiveBeginId,
+						dev_primitives,
+						*p);
 				checkCUDAError("Primitive Assembly");
 
 				curPrimitiveBeginId += p->numPrimitives;
@@ -718,28 +884,34 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g
 
 		checkCUDAError("Vertex Processing and Primitive Assembly");
 	}
-	
+
 	cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
 	initDepth << <blockCount2d, blockSize2d >> >(width, height, dev_depth);
-	
-	// TODO: rasterize
 
+	// TODO: rasterize
+	dim3 blockSize(128);
+	dim3 blockNum((totalNumPrimitives + blockSize.x - 1) / blockSize.x);
+	rasterization << <blockNum, blockSize >> > (totalNumPrimitives, dev_primitives, dev_fragmentBuffer, width, height, dev_depth);
 
 
-    // Copy depthbuffer colors into framebuffer
+	// Copy depthbuffer colors into framebuffer
 	render << <blockCount2d, blockSize2d >> >(width, height, dev_fragmentBuffer, dev_framebuffer);
 	checkCUDAError("fragment shader");
-    // Copy framebuffer into OpenGL buffer for OpenGL previewing
-    sendImageToPBO<<<blockCount2d, blockSize2d>>>(pbo, width, height, dev_framebuffer);
-    checkCUDAError("copy render result to pbo");
+	// Copy framebuffer into OpenGL buffer for OpenGL previewing
+#if SSAA
+	sendImageToPBO << <blockCount2d, blockSize2d >> > (pbo, width / SSAA, height / SSAA, dev_framebuffer);
+#else
+	sendImageToPBO << <blockCount2d, blockSize2d >> >(pbo, width, height, dev_framebuffer);
+#endif
+	checkCUDAError("copy render result to pbo");
 }
 
 /**
- * Called once at the end of the program to free CUDA memory.
- */
+* Called once at the end of the program to free CUDA memory.
+*/
 void rasterizeFree() {
 
-    // deconstruct primitives attribute/indices device buffer
+	// deconstruct primitives attribute/indices device buffer
 
 	auto it(mesh2PrimitivesMap.begin());
 	auto itEnd(mesh2PrimitivesMap.end());
@@ -753,24 +925,24 @@ void rasterizeFree() {
 
 			cudaFree(p->dev_verticesOut);
 
-			
+
 			//TODO: release other attributes and materials
 		}
 	}
 
 	////////////
 
-    cudaFree(dev_primitives);
-    dev_primitives = NULL;
+	cudaFree(dev_primitives);
+	dev_primitives = NULL;
 
 	cudaFree(dev_fragmentBuffer);
 	dev_fragmentBuffer = NULL;
 
-    cudaFree(dev_framebuffer);
-    dev_framebuffer = NULL;
+	cudaFree(dev_framebuffer);
+	dev_framebuffer = NULL;
 
 	cudaFree(dev_depth);
 	dev_depth = NULL;
 
-    checkCUDAError("rasterize Free");
+	checkCUDAError("rasterize Free");
 }
diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt
index c995fae..084fb39 100644
--- a/util/CMakeLists.txt
+++ b/util/CMakeLists.txt
@@ -8,5 +8,5 @@ set(SOURCE_FILES
 
 cuda_add_library(util
    ${SOURCE_FILES}
-   OPTIONS -arch=sm_52
+   OPTIONS -arch=sm_61
    )