diff --git a/README.md b/README.md
index d63a6a1..2e4f07d 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,49 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture,
 Project 1 - Flocking**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Zichuan Yu
+  * [LinkedIn](https://www.linkedin.com/in/zichuan-yu/), [Behance](https://www.behance.net/zainyu717ebcc)
+* Tested on: Windows 10.0.17134 Build 17134, i7-4710 @ 2.50GHz 16GB, GTX 980m 4096MB GDDR5
 
-### (TODO: Your README)
+## Screenshots
+
+### Static
+
+![result](images/static.png)
+
+### GIF
+
+![result](images/motion.gif)
+
+## Performance Analysis
+
+### Change number of boids, with visualization
+
+![fps_with_visualization](images/FPS%20with%20visualization.png)
+
+### Change number of boids, without visualization
+
+![fps_without_visualization](images/FPS%20without%20visualization.png)
+
+### Fix number of boids, change block size, without visualization
+
+![fps_without_visualization_blocksize](images/FPS%20without%20visualization%20block%20size.png)
+
+## Questions
+
+### For each implementation, how does changing the number of boids affect performance? Why do you think this is?
+
+Yes. The two advanced methoeds have much better preformance. Because we are saving a lot of unnecessary checkings. The larger the number of boids, the more we save.
+
+### For each implementation, how does changing the block count and block size affect performance? Why do you think this is?
+
+When the block size is small, the performance is not good. When it is larger or equal to 32, the performance is largely improved and doesn't change much as we increase the block size. Not sure why this happens. **Maybe** related to wrap size.
+
+### For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not?
+
+I experience much performance improvements with it. Because when we use scattered memeory method, why are accessing a random memeory place everytime we need to check a neigboring boids. Yet in coherent method, we only do this once when we shuffle the pos and vel.
+
+### Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not? Be careful: it is insufficient (and possibly incorrect) to say that 27-cell is slower simply because there are more cells to check!
+
+When the number of boids becomes large, int 27 cells' case though we check more neighbors, the actuall boids we need to iterate through is actually less, so the performance is better.
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
diff --git a/images/FPS with visualization.png b/images/FPS with visualization.png
new file mode 100644
index 0000000..27263aa
Binary files /dev/null and b/images/FPS with visualization.png differ
diff --git a/images/FPS without visualization block size.png b/images/FPS without visualization block size.png
new file mode 100644
index 0000000..f447b94
Binary files /dev/null and b/images/FPS without visualization block size.png differ
diff --git a/images/FPS without visualization.png b/images/FPS without visualization.png
new file mode 100644
index 0000000..489c17f
Binary files /dev/null and b/images/FPS without visualization.png differ
diff --git a/images/motion.gif b/images/motion.gif
new file mode 100644
index 0000000..73308b5
Binary files /dev/null and b/images/motion.gif differ
diff --git a/images/static.png b/images/static.png
new file mode 100644
index 0000000..0f49227
Binary files /dev/null and b/images/static.png differ
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index fdd636d..dff0113 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -10,5 +10,5 @@ set(SOURCE_FILES
 
 cuda_add_library(src
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_50
     )
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..4c14b92 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -37,7 +37,7 @@ void checkCUDAError(const char *msg, int line = -1) {
 *****************/
 
 /*! Block size used for CUDA kernel launch. */
-#define blockSize 128
+#define blockSize 1024
 
 // LOOK-1.2 Parameters for the boids algorithm.
 // These worked well in our reference implementation.
@@ -52,7 +52,7 @@ void checkCUDAError(const char *msg, int line = -1) {
 #define maxSpeed 1.0f
 
 /*! Size of the starting area in simulation space. */
-#define scene_scale 100.0f
+#define scene_scale 100.0f // default 100.0f
 
 /***********************************************
 * Kernel state (pointers are device pointers) *
@@ -67,6 +67,7 @@ dim3 threadsPerBlock(blockSize);
 // boid cares about its neighbors' velocities.
 // These are called ping-pong buffers.
 glm::vec3 *dev_pos;
+glm::vec3 *dev_pos2;
 glm::vec3 *dev_vel1;
 glm::vec3 *dev_vel2;
 
@@ -77,9 +78,11 @@ glm::vec3 *dev_vel2;
 int *dev_particleArrayIndices; // What index in dev_pos and dev_velX represents this particle?
 int *dev_particleGridIndices; // What grid cell is this particle in?
 // needed for use with thrust
+// 1  to 1
 thrust::device_ptr<int> dev_thrust_particleArrayIndices;
 thrust::device_ptr<int> dev_thrust_particleGridIndices;
 
+// Length is the number of cells
 int *dev_gridCellStartIndices; // What part of dev_particleArrayIndices belongs
 int *dev_gridCellEndIndices;   // to this cell?
 
@@ -96,7 +99,7 @@ glm::vec3 gridMinimum;
 
 /******************
 * initSimulation *
-******************/
+******************/   
 
 __host__ __device__ unsigned int hash(unsigned int a) {
   a = (a + 0x7ed55d16) + (a << 12);
@@ -138,6 +141,8 @@ __global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, flo
 */
 void Boids::initSimulation(int N) {
   numObjects = N;
+  // fullBlocksPerGrid, 1, 1
+  // (5000 + 128 - 1) / 128
   dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
 
   // LOOK-1.2 - This is basic CUDA memory management and error checking.
@@ -145,15 +150,18 @@ void Boids::initSimulation(int N) {
   cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3));
   checkCUDAErrorWithLine("cudaMalloc dev_pos failed!");
 
+  cudaMalloc((void**)&dev_pos2, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_pos2 failed!");
+
   cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3));
   checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!");
-
+     
   cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3));
   checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!");
 
   // LOOK-1.2 - This is a typical CUDA kernel invocation.
   kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects,
-    dev_pos, scene_scale);
+                                                               dev_pos, scene_scale);
   checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
 
   // LOOK-2.1 computing grid params
@@ -168,6 +176,18 @@ void Boids::initSimulation(int N) {
   gridMinimum.y -= halfGridWidth;
   gridMinimum.z -= halfGridWidth;
 
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+  cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
   cudaDeviceSynchronize();
 }
@@ -210,8 +230,8 @@ __global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float
 void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) {
   dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
 
-  kernCopyPositionsToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_pos, vbodptr_positions, scene_scale);
-  kernCopyVelocitiesToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_vel1, vbodptr_velocities, scene_scale);
+  kernCopyPositionsToVBO <<<fullBlocksPerGrid, blockSize >>>(numObjects, dev_pos, vbodptr_positions, scene_scale);
+  kernCopyVelocitiesToVBO <<<fullBlocksPerGrid, blockSize >>>(numObjects, dev_vel1, vbodptr_velocities, scene_scale);
 
   checkCUDAErrorWithLine("copyBoidsToVBO failed!");
 
@@ -229,29 +249,89 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
 * Compute the new velocity on the body with index `iSelf` due to the `N` boids
 * in the `pos` and `vel` arrays.
 */
-__device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
+__device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3* pos, const glm::vec3* vel) {
   // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
   // Rule 2: boids try to stay a distance d away from each other
   // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 output_vel{ 0.0f };
+  glm::vec3 my_pos{ pos[iSelf] };
+  glm::vec3 perceived_center{ 0.0f };
+  glm::vec3 keep_distance{ 0.0f };
+  glm::vec3 perceived_vel{ 0.0f };
+  int center_N = 0;
+  int vel_N = 0;
+
+
+  for (int i = 0; i < N; ++i) {
+    if (i != iSelf) {
+      glm::vec3 other_pos{ pos[i] };
+      float dis = glm::distance(other_pos, my_pos);
+      // rule 1
+      if (dis < rule1Distance) {
+        perceived_center += other_pos;
+        ++center_N;
+      }
+      
+      // rule 2
+      if (dis < rule2Distance) {
+        keep_distance -= (other_pos - my_pos);
+      }
+
+      // rule 3
+      if (dis < rule3Distance) {
+        perceived_vel += vel[i];
+        ++vel_N;
+      }
+    }
+  }
+
+
+  if (center_N != 0) {
+    perceived_center /= float(center_N);
+    output_vel += (perceived_center - my_pos) * rule1Scale;
+  }
+
+  output_vel += keep_distance * rule2Scale;
+
+  if (vel_N != 0) {
+    perceived_vel /= float(vel_N);
+    output_vel += perceived_vel * rule3Scale;
+  }
+  return output_vel;
 }
 
 /**
 * TODO-1.2 implement basic flocking
 * For each of the `N` bodies, update its position based on its current velocity.
 */
-__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
-  glm::vec3 *vel1, glm::vec3 *vel2) {
+__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3* pos,
+  glm::vec3* vel1, glm::vec3* vel2) {
   // Compute a new velocity based on pos and vel1
   // Clamp the speed
   // Record the new velocity into vel2. Question: why NOT vel1?
+  // Becasue updating vel1 will affect other thread
+
+  // calc idx
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  if (idx >= N) {
+    return;
+  }
+
+  // calc v2
+  glm::vec3 new_v = computeVelocityChange(N, idx, pos, vel1) + vel1[idx];
+
+  if (glm::length(new_v) > maxSpeed) {
+    new_v = glm::normalize(new_v) * maxSpeed;
+  }
+
+  vel2[idx] = new_v;
 }
 
 /**
 * LOOK-1.2 Since this is pretty trivial, we implemented it for you.
 * For each of the `N` bodies, update its position based on its current velocity.
 */
-__global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) {
+__global__ void kernUpdatePos(int N, float dt, glm::vec3* pos, glm::vec3* vel) {
   // Update position by velocity
   int index = threadIdx.x + (blockIdx.x * blockDim.x);
   if (index >= N) {
@@ -278,42 +358,127 @@ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) {
 //          for(x)
 //            for(y)
 //             for(z)? Or some other order?
+// z->y->x, because x is increamented by 1
 __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) {
   return x + y * gridResolution + z * gridResolution * gridResolution;
 }
 
 __global__ void kernComputeIndices(int N, int gridResolution,
   glm::vec3 gridMin, float inverseCellWidth,
-  glm::vec3 *pos, int *indices, int *gridIndices) {
+  glm::vec3* pos, int* indices, int* gridIndices) {
     // TODO-2.1
     // - Label each boid with the index of its grid cell.
     // - Set up a parallel array of integer indices as pointers to the actual
     //   boid data in pos and vel1/vel2
+  int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (idx >= N) {
+    return;
+  }
+  indices[idx] = idx;
+  // simply use truncation
+  glm::vec3 my_pos = pos[idx];
+  int x = (int)((my_pos.x - gridMin.x) * inverseCellWidth);
+  int y = (int)((my_pos.y - gridMin.y) * inverseCellWidth);
+  int z = (int)((my_pos.z - gridMin.z) * inverseCellWidth);
+  gridIndices[idx] = gridIndex3Dto1D(x, y, z, gridResolution);
+  // DONE
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
 //          does not enclose any boids
-__global__ void kernResetIntBuffer(int N, int *intBuffer, int value) {
+__global__ void kernResetIntBuffer(int N, int* intBuffer, int value) {
   int index = (blockIdx.x * blockDim.x) + threadIdx.x;
   if (index < N) {
     intBuffer[index] = value;
   }
 }
 
-__global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
-  int *gridCellStartIndices, int *gridCellEndIndices) {
+__global__ void kernIdentifyCellStartEnd(int N, int* particleGridIndices,
+  int* gridCellStartIndices, int* gridCellEndIndices) {
   // TODO-2.1
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  if (idx >= N) {
+    return;
+  }
+  int gridIdx = particleGridIndices[idx];
+  if (idx == 0) {
+    gridCellStartIndices[gridIdx] = 0;
+  } else {
+    if (particleGridIndices[idx] != particleGridIndices[idx - 1]) {
+      gridCellStartIndices[gridIdx] = idx;
+    }
+  }
+
+  if (idx == N - 1) {
+    gridCellStartIndices[gridIdx] = idx;
+  } else {
+    if (particleGridIndices[idx] != particleGridIndices[idx + 1]) {
+      gridCellEndIndices[gridIdx] = idx;
+    }
+  }
+}
+
+// handle one cell
+__device__ void handleOneRange(
+  int cell_id,
+  int* center_N_out,
+  int* vel_N_out,
+  glm::vec3* perceived_center_out,
+  glm::vec3* keep_distance_out,
+  glm::vec3* perceived_vel_out,
+  glm::vec3* vel1,
+  glm::vec3* pos,
+  const int* gridCellStartIndices,
+  const int* gridCellEndIndices,
+  const int* particleArrayIndices,
+  int idx,
+  const glm::vec3* my_pos) {
+  int start_idx = gridCellStartIndices[cell_id];
+  int end_idx = gridCellEndIndices[cell_id];
+  if (start_idx == -1 || end_idx == -1) return;
+  for (int i = start_idx; i <= end_idx; ++i) {
+    int other_idx = particleArrayIndices[i];
+    if (idx != other_idx) {
+      glm::vec3 other_pos = pos[other_idx];
+      float dis = glm::distance(other_pos, *my_pos);
+      // rule 1
+      if (dis < rule1Distance) {
+        *perceived_center_out += other_pos;
+        ++ *center_N_out;
+      }
+
+      // rule 2
+      if (dis < rule2Distance) {
+        *keep_distance_out -= (other_pos - *my_pos);
+      }
+
+      // rule 3
+      if (dis < rule3Distance) {
+        *perceived_vel_out += vel1[i];
+        ++ *vel_N_out;
+      }
+    }
+  }
+}
+
+__device__ int getCellId(int x, int y, int z, int gridResolution) {
+  if (x < 0 || x >= gridResolution
+    || y < 0 || y >= gridResolution
+    || z < 0 || z >= gridResolution) {
+    return -1;
+  }
+  return gridIndex3Dto1D(x, y, z, gridResolution);
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
   int N, int gridResolution, glm::vec3 gridMin,
   float inverseCellWidth, float cellWidth,
-  int *gridCellStartIndices, int *gridCellEndIndices,
-  int *particleArrayIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+  int* gridCellStartIndices, int* gridCellEndIndices,
+  int* particleArrayIndices,
+  glm::vec3* pos, glm::vec3* vel1, glm::vec3* vel2) {
   // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
   // the number of boids that need to be checked.
   // - Identify the grid cell that this particle is in
@@ -322,6 +487,104 @@ __global__ void kernUpdateVelNeighborSearchScattered(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  if (idx >= N) {
+    return;
+  }
+  glm::vec3 my_pos{ pos[idx] };
+  // calc cell id
+  float positive_x = my_pos.x - gridMin.x;
+  float positive_y = my_pos.y - gridMin.y;
+  float positive_z = my_pos.z - gridMin.z;
+  int cell_x = (int)(positive_x * inverseCellWidth);
+  int cell_y = (int)(positive_y * inverseCellWidth);
+  int cell_z = (int)(positive_z * inverseCellWidth);
+  // o_ for offset
+  int o_x = 1;
+  int o_y = 1;
+  int o_z = 1;
+  if ((float)(positive_x - cell_x * cellWidth) < cellWidth / (float)2) o_x = -1;
+  if ((float)(positive_y - cell_y * cellWidth) < cellWidth / (float)2) o_y = -1;
+  if ((float)(positive_z - cell_z * cellWidth) < cellWidth / (float)2) o_z = -1;
+
+  int check_cells[8];
+  check_cells[0] = getCellId(cell_x, cell_y, cell_z, gridResolution);
+  check_cells[1] = getCellId(cell_x, cell_y, cell_z + o_z, gridResolution);
+  check_cells[2] = getCellId(cell_x, cell_y + o_y, cell_z, gridResolution);
+  check_cells[3] = getCellId(cell_x, cell_y + o_y, cell_z + o_z, gridResolution);
+  check_cells[4] = getCellId(cell_x + o_x, cell_y, cell_z, gridResolution);
+  check_cells[5] = getCellId(cell_x + o_x, cell_y, cell_z + o_z, gridResolution);
+  check_cells[6] = getCellId(cell_x + o_x, cell_y + o_y, cell_z, gridResolution);
+  check_cells[7] = getCellId(cell_x + o_x, cell_y + o_y, cell_z + o_z, gridResolution);
+
+  glm::vec3 perceived_center{ 0.0f };
+  glm::vec3 keep_distance{ 0.0f };
+  glm::vec3 perceived_vel{ 0.0f };
+  glm::vec3 new_v{ 0.0f };
+  int center_N = 0;
+  int vel_N = 0;
+
+  for (int cell_id : check_cells) {
+    if (cell_id != -1) {
+      int start_idx = gridCellStartIndices[cell_id];
+      int end_idx = gridCellEndIndices[cell_id];
+      // this cell has no boids
+      if (start_idx == -1 || end_idx == -1) continue;
+      for (int i = start_idx; i <= end_idx; ++i) {
+        int other_idx = particleArrayIndices[i];
+        if (idx != other_idx) {
+          glm::vec3 other_pos = pos[other_idx];
+          float dis = glm::distance(other_pos, my_pos);
+          // rule 1
+          if (dis < rule1Distance) {
+            perceived_center += other_pos;
+            ++center_N;
+          }
+          // rule 2
+          if (dis < rule2Distance) {
+            keep_distance -= (other_pos - my_pos);
+          }
+          // rule 3
+          if (dis < rule3Distance) {
+            perceived_vel += vel1[other_idx];
+            ++vel_N;
+          }
+        }
+      }
+    }
+  }
+
+  if (center_N != 0) {
+    perceived_center /= float(center_N);
+    new_v += (perceived_center - my_pos) * rule1Scale;
+  }
+
+  new_v += keep_distance * rule2Scale;
+
+  if (vel_N != 0) {
+    perceived_vel /= float(vel_N);
+    new_v += perceived_vel * rule3Scale;
+  }
+
+  new_v += vel1[idx];
+
+  if (glm::length(new_v) > maxSpeed) {
+    new_v = glm::normalize(new_v) * maxSpeed;
+  }
+
+  vel2[idx] = new_v;
+}
+
+__global__ void kernShuffle(int N, int* particleArrayIndices,
+  glm::vec3* pos1,
+  glm::vec3* pos2,
+  glm::vec3* vel1,
+  glm::vec3* vel2) {
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  if (idx >= N) return;
+  pos2[idx] = pos1[particleArrayIndices[idx]];
+  vel2[idx] = vel1[particleArrayIndices[idx]];
 }
 
 __global__ void kernUpdateVelNeighborSearchCoherent(
@@ -341,6 +604,106 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  if (idx >= N) {
+    return;
+  }
+  glm::vec3 my_pos{ pos[idx] };
+  // calc cell id
+
+  float positive_x = my_pos.x - gridMin.x;
+  float positive_y = my_pos.y - gridMin.y;
+  float positive_z = my_pos.z - gridMin.z;
+  int cell_x = (int)(positive_x * inverseCellWidth);
+  int cell_y = (int)(positive_y * inverseCellWidth);
+  int cell_z = (int)(positive_z * inverseCellWidth);
+#if 0
+  // o_ for offset
+  int o_x = 1;
+  int o_y = 1;
+  int o_z = 1;
+  if ((float)(positive_x - cell_x * cellWidth) < cellWidth / (float)2) o_x = -1;
+  if ((float)(positive_y - cell_y * cellWidth) < cellWidth / (float)2) o_y = -1;
+  if ((float)(positive_z - cell_z * cellWidth) < cellWidth / (float)2) o_z = -1;
+
+
+  int check_cells[8];
+  check_cells[0] = getCellId(cell_x, cell_y, cell_z, gridResolution);
+  check_cells[1] = getCellId(cell_x, cell_y, cell_z + o_z, gridResolution);
+  check_cells[2] = getCellId(cell_x, cell_y + o_y, cell_z, gridResolution);
+  check_cells[3] = getCellId(cell_x, cell_y + o_y, cell_z + o_z, gridResolution);
+  check_cells[4] = getCellId(cell_x + o_x, cell_y, cell_z, gridResolution);
+  check_cells[5] = getCellId(cell_x + o_x, cell_y, cell_z + o_z, gridResolution);
+  check_cells[6] = getCellId(cell_x + o_x, cell_y + o_y, cell_z, gridResolution);
+  check_cells[7] = getCellId(cell_x + o_x, cell_y + o_y, cell_z + o_z, gridResolution);
+#else
+  int check_cells[27];
+  int temp_count = 0;
+  for (int i = -1; i <= 1; ++i) {
+    for (int j = -1; j <= 1; ++j) {
+      for (int k = -1; k <= 1; ++k) {
+        check_cells[temp_count] = getCellId(cell_x + i, cell_y + j, cell_z + k, gridResolution);
+        ++temp_count;
+      }
+    }
+  }
+#endif
+  glm::vec3 perceived_center{ 0.0f };
+  glm::vec3 keep_distance{ 0.0f };
+  glm::vec3 perceived_vel{ 0.0f };
+  glm::vec3 new_v{ 0.0f };
+  int center_N = 0;
+  int vel_N = 0;
+
+  for (int cell_id : check_cells) {
+    if (cell_id != -1) {
+      int start_idx = gridCellStartIndices[cell_id];
+      int end_idx = gridCellEndIndices[cell_id];
+      // this cell has no boids
+      if (start_idx == -1 || end_idx == -1) continue;
+      for (int i = start_idx; i <= end_idx; ++i) {
+        int other_idx = i;
+        if (idx != other_idx) {
+          glm::vec3 other_pos = pos[other_idx];
+          float dis = glm::distance(other_pos, my_pos);
+          // rule 1
+          if (dis < rule1Distance) {
+            perceived_center += other_pos;
+            ++center_N;
+          }
+          // rule 2
+          if (dis < rule2Distance) {
+            keep_distance -= (other_pos - my_pos);
+          }
+          // rule 3
+          if (dis < rule3Distance) {
+            perceived_vel += vel1[other_idx];
+            ++vel_N;
+          }
+        }
+      }
+    }
+  }
+
+  if (center_N != 0) {
+    perceived_center /= float(center_N);
+    new_v += (perceived_center - my_pos) * rule1Scale;
+  }
+
+  new_v += keep_distance * rule2Scale;
+
+  if (vel_N != 0) {
+    perceived_vel /= float(vel_N);
+    new_v += perceived_vel * rule3Scale;
+  }
+
+  new_v += vel1[idx];
+
+  if (glm::length(new_v) > maxSpeed) {
+    new_v = glm::normalize(new_v) * maxSpeed;
+  }
+
+  vel2[idx] = new_v;
 }
 
 /**
@@ -349,6 +712,18 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
   // TODO-1.2 ping-pong the velocity buffers
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+  kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, threadsPerBlock>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!");
+
+  // cudaMemcpy(dev_vel1, dev_vel2, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice);
+  // checkCUDAErrorWithLine("dev_vel2 to dev_vel1 copy failed!");
+
+  std::swap(dev_vel1, dev_vel2);
+
+  kernUpdatePos<<<fullBlocksPerGrid, threadsPerBlock>>>(numObjects, dt, dev_pos, dev_vel1);
+  checkCUDAErrorWithLine("kernUpdatePos failed!");
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
@@ -364,6 +739,107 @@ void Boids::stepSimulationScatteredGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+  kernComputeIndices <<<fullBlocksPerGrid, threadsPerBlock>>>(numObjects,
+                                                              gridSideCount,
+                                                              gridMinimum,
+                                                              gridInverseCellWidth,
+                                                              dev_pos,
+                                                              dev_particleArrayIndices,
+                                                              dev_particleGridIndices);
+  checkCUDAErrorWithLine("kernComputeIndices failed!");
+
+  // init thrust pointer, and sort
+  dev_thrust_particleArrayIndices = thrust::device_pointer_cast(dev_particleArrayIndices);
+  dev_thrust_particleGridIndices = thrust::device_pointer_cast(dev_particleGridIndices);
+  thrust::sort_by_key(dev_thrust_particleGridIndices,
+                      dev_thrust_particleGridIndices + numObjects,
+                      dev_thrust_particleArrayIndices);
+  checkCUDAErrorWithLine("sort_by_key on dev_thrust_particleGridIndices failed!");
+
+  // check if sorting is succesful, debugging use
+  /*std::unique_ptr<int[]>checkArrayIndices{ new int[numObjects] };
+  std::unique_ptr<int[]>checkGridIndices{ new int[numObjects] };
+  cudaMemcpy(checkArrayIndices.get(), dev_particleArrayIndices,
+             numObjects * sizeof(int), cudaMemcpyDeviceToHost);
+  cudaMemcpy(checkGridIndices.get(), dev_particleGridIndices,
+             numObjects * sizeof(int), cudaMemcpyDeviceToHost);
+  for (int i = 0; i < numObjects; ++i) {
+    std::cout << checkArrayIndices[i] << std::endl;
+  }
+
+  std::cout << "----------------------------------" << std::endl;
+  std::cout << "----------------------------------" << std::endl;
+  std::cout << "----------------------------------" << std::endl;
+  std::cout << "----------------------------------" << std::endl;
+  std::cout << "----------------------------------" << std::endl;
+
+  for (int i = 0; i < numObjects; ++i) {
+    std::cout << checkGridIndices[i] << std::endl;
+  }
+
+  std::cout << "----------------------------------" << std::endl;
+  std::cout << "----------------------------------" << std::endl;
+  std::cout << "----------------------------------" << std::endl;
+  std::cout << "----------------------------------" << std::endl;
+  std::cout << "----------------------------------" << std::endl;*/
+
+  // calc start and end index
+  // before calc, set to -1, other wise emtpy cell will be set to 0
+  kernResetIntBuffer<<<fullBlocksPerGrid, threadsPerBlock>>>(gridCellCount,
+                                                             dev_gridCellStartIndices, -1);
+  checkCUDAErrorWithLine("init dev_gridCellStartIndices to -1 failed!");
+  kernResetIntBuffer<<<fullBlocksPerGrid, threadsPerBlock>>>(gridCellCount,
+                                                             dev_gridCellEndIndices, -1);
+  checkCUDAErrorWithLine("init dev_gridCellEndIndices to -1 failed!");
+  // check if start/end success
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, threadsPerBlock>>>(numObjects,
+                                                                   dev_particleGridIndices,
+                                                                   dev_gridCellStartIndices,
+                                                                   dev_gridCellEndIndices);
+  checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!");
+  // test if calc start/end right
+ /* std::unique_ptr<int[]>check_gridCellStartIndices{ new int[gridCellCount] };
+  std::unique_ptr<int[]>check_gridCellEndIndices{ new int[gridCellCount] };
+  cudaMemcpy(check_gridCellStartIndices.get(), dev_gridCellStartIndices,
+    gridCellCount * sizeof(int), cudaMemcpyDeviceToHost);
+  cudaMemcpy(check_gridCellEndIndices.get(), dev_gridCellEndIndices,
+    gridCellCount * sizeof(int), cudaMemcpyDeviceToHost);
+  for (int i = 0; i < numObjects; ++i) {
+  std::cout << check_gridCellStartIndices[i] << std::endl;
+  }
+  std::cout << "----------------------------------" << std::endl;
+  std::cout << "----------------------------------" << std::endl;
+  std::cout << "----------------------------------" << std::endl;
+  std::cout << "----------------------------------" << std::endl;
+  std::cout << "----------------------------------" << std::endl;
+
+  for (int i = 0; i < numObjects; ++i) {
+  std::cout << check_gridCellEndIndices[i] << std::endl;
+  }
+  int a = 0;*/
+
+  // do real things
+  kernUpdateVelNeighborSearchScattered <<<fullBlocksPerGrid, threadsPerBlock >>> (
+    numObjects,
+    gridSideCount,
+    gridMinimum,
+    gridInverseCellWidth,
+    gridCellWidth,
+    dev_gridCellStartIndices,
+    dev_gridCellEndIndices,
+    dev_particleArrayIndices,
+    dev_pos,
+    dev_vel1,
+    dev_vel2);
+  checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!");
+
+  cudaMemcpy(dev_vel1, dev_vel2, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice);
+  checkCUDAErrorWithLine("dev_vel2 to dev_vel1 copy failed!");
+
+  kernUpdatePos << <fullBlocksPerGrid, threadsPerBlock >> >(numObjects, dt, dev_pos, dev_vel1);
+  checkCUDAErrorWithLine("kernUpdatePos failed!");
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
@@ -382,6 +858,71 @@ void Boids::stepSimulationCoherentGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+  kernComputeIndices << <fullBlocksPerGrid, threadsPerBlock >> >(numObjects,
+    gridSideCount,
+    gridMinimum,
+    gridInverseCellWidth,
+    dev_pos,
+    dev_particleArrayIndices,
+    dev_particleGridIndices);
+  checkCUDAErrorWithLine("kernComputeIndices failed!");
+
+  // init thrust pointer, and sort
+  dev_thrust_particleArrayIndices = thrust::device_pointer_cast(dev_particleArrayIndices);
+  dev_thrust_particleGridIndices = thrust::device_pointer_cast(dev_particleGridIndices);
+  thrust::sort_by_key(dev_thrust_particleGridIndices,
+    dev_thrust_particleGridIndices + numObjects,
+    dev_thrust_particleArrayIndices);
+  checkCUDAErrorWithLine("sort_by_key on dev_thrust_particleGridIndices failed!");
+
+  // calc start and end index
+  // before calc, set to -1, other wise emtpy cell will be set to 0
+  kernResetIntBuffer << <fullBlocksPerGrid, threadsPerBlock >> >(gridCellCount,
+    dev_gridCellStartIndices, -1);
+  checkCUDAErrorWithLine("init dev_gridCellStartIndices to -1 failed!");
+  kernResetIntBuffer << <fullBlocksPerGrid, threadsPerBlock >> >(gridCellCount,
+    dev_gridCellEndIndices, -1);
+  checkCUDAErrorWithLine("init dev_gridCellEndIndices to -1 failed!");
+  // check if start/end success
+  kernIdentifyCellStartEnd << <fullBlocksPerGrid, threadsPerBlock >> >(numObjects,
+    dev_particleGridIndices,
+    dev_gridCellStartIndices,
+    dev_gridCellEndIndices);
+
+
+  // shuffle
+  kernShuffle <<<fullBlocksPerGrid, threadsPerBlock >>> (numObjects,
+    dev_particleArrayIndices,
+    dev_pos,
+    dev_pos2,
+    dev_vel1,
+    dev_vel2);
+  checkCUDAErrorWithLine("kernShuffle failed!");
+
+  std::swap(dev_pos, dev_pos2);
+  std::swap(dev_vel1, dev_vel2);
+
+  // do real things
+  // HERE
+  kernUpdateVelNeighborSearchCoherent <<<fullBlocksPerGrid, threadsPerBlock >>> (
+    numObjects,
+    gridSideCount,
+    gridMinimum,
+    gridInverseCellWidth,
+    gridCellWidth,
+    dev_gridCellStartIndices,
+    dev_gridCellEndIndices,
+    dev_pos,
+    dev_vel1,
+    dev_vel2);
+  checkCUDAErrorWithLine("kernUpdateVelNeighborSearchCoherent failed!");
+
+  std::swap(dev_vel1, dev_vel2);
+
+  kernUpdatePos << <fullBlocksPerGrid, threadsPerBlock >> >(numObjects, dt, dev_pos, dev_vel1);
+  checkCUDAErrorWithLine("kernUpdatePos failed!");
 }
 
 void Boids::endSimulation() {
@@ -389,6 +930,11 @@ void Boids::endSimulation() {
   cudaFree(dev_vel2);
   cudaFree(dev_pos);
 
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_particleGridIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_gridCellEndIndices);
+
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
 }
 
@@ -396,10 +942,11 @@ void Boids::unitTest() {
   // LOOK-1.2 Feel free to write additional tests here.
 
   // test unstable sort
-  int *dev_intKeys;
-  int *dev_intValues;
+  int* dev_intKeys;
+  int* dev_intValues;
   int N = 10;
 
+  // TODO check {}
   std::unique_ptr<int[]>intKeys{ new int[N] };
   std::unique_ptr<int[]>intValues{ new int[N] };
 
@@ -420,6 +967,7 @@ void Boids::unitTest() {
   cudaMalloc((void**)&dev_intValues, N * sizeof(int));
   checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!");
 
+  // 1, 1, 1
   dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
 
   std::cout << "before unstable sort: " << std::endl;
@@ -436,6 +984,7 @@ void Boids::unitTest() {
   thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
   thrust::device_ptr<int> dev_thrust_values(dev_intValues);
   // LOOK-2.1 Example for using thrust::sort_by_key
+  // This runs in parallel
   thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
 
   // How to copy data back to the CPU side from the GPU
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..2701c96 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,12 +14,12 @@
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
 #define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
-const float DT = 0.2f;
+const int N_FOR_VIS = 20000; // default 5000
+const float DT = 0.65f; // default 0.2
 
 /**
 * C main function.
@@ -41,7 +41,7 @@ int main(int argc, char* argv[]) {
 //-------------------------------
 
 std::string deviceName;
-GLFWwindow *window;
+GLFWwindow* window;
 
 /**
 * Initialization of CUDA and GLFW.
@@ -63,6 +63,7 @@ bool init(int argc, char **argv) {
   int major = deviceProp.major;
   int minor = deviceProp.minor;
 
+  // just another way to construct string
   std::ostringstream ss;
   ss << projectName << " [SM " << major << "." << minor << " " << deviceProp.name << "]";
   deviceName = ss.str();
@@ -120,13 +121,18 @@ bool init(int argc, char **argv) {
   return true;
 }
 
+// Vertex Array Object
 void initVAO() {
-
   std::unique_ptr<GLfloat[]> bodies{ new GLfloat[4 * (N_FOR_VIS)] };
+  // bin indices
   std::unique_ptr<GLuint[]> bindices{ new GLuint[N_FOR_VIS] };
 
   glm::vec4 ul(-1.0, -1.0, 1.0, 1.0);
   glm::vec4 lr(1.0, 1.0, 0.0, 0.0);
+  // UL---UR
+  // |     |
+  // |     |
+  // LL---LR
 
   for (int i = 0; i < N_FOR_VIS; i++) {
     bodies[4 * i + 0] = 0.0f;
@@ -136,10 +142,10 @@ void initVAO() {
     bindices[i] = i;
   }
 
-
   glGenVertexArrays(1, &boidVAO); // Attach everything needed to draw a particle to this
   glGenBuffers(1, &boidVBO_positions);
   glGenBuffers(1, &boidVBO_velocities);
+  // index buffer object
   glGenBuffers(1, &boidIBO);
 
   glBindVertexArray(boidVAO);
@@ -188,9 +194,9 @@ void initShaders(GLuint * program) {
     // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not
     // use this buffer
 
-    float4 *dptr = NULL;
-    float *dptrVertPositions = NULL;
-    float *dptrVertVelocities = NULL;
+    float4* dptr = NULL;
+    float* dptrVertPositions = NULL;
+    float* dptrVertVelocities = NULL;
 
     cudaGLMapBufferObject((void**)&dptrVertPositions, boidVBO_positions);
     cudaGLMapBufferObject((void**)&dptrVertVelocities, boidVBO_velocities);