diff --git a/README.md b/README.md
index ee39093..c593f8e 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,29 @@
 **University of Pennsylvania, CIS 5650: GPU Programming and Architecture,
 Project 1 - Flocking**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Shreyas Singh
+  * [LinkedIn](https://linkedin.com/in/shreyassinghiitr), [personal website](https://github.com/shreyas3156), etc.
+* Tested on: Windows 10, i7-12700 @ 2.1GHz 32GB, T1000 (CETS Lab)
+### Boids Flocking Simulation
 
-### (TODO: Your README)
+This project is a CUDA-based simulation of the Boids flocking algorithm, together with an analysis of how its performance varies with parameters such as the number of boids and the number of threads in each GPU block.
+
+![Uniform grid (scattered) simulation](images/UniformGridScattered.gif)
+![Naive simulation](images/NaiveBoid.gif)
+
+### Performance Analysis
+
+* The Naive Neighbor Search algorithm yields the slowest performance, which is explained by its $O(n^2)$ complexity: every boid is evaluated against every other boid. The Scattered Uniform Grid search is more than 5x faster than the Naive Neighbor Search owing to its much smaller search space. Moreover, the Coherent Uniform Grid search produces slightly faster results than the scattered version once the boid count is increased beyond 5000, because the boid data is then traversed contiguously in memory instead of through the boid data pointers. These results are consistent across simulations both with and without visualization.
+![](images/FPSvsBoids_Without_Visualization.png)
+![](images/FPSvsBoids_With_Visualization.png)
+
+* The performance of all three algorithms remains roughly constant as the block size is increased from 32 to 256 (with the block count changing accordingly), mainly because scheduling is managed in "warps" of 32 threads each. The coherent uniform grid search has the best performance among the three algorithms, as expected.
+![](images/FPSvsBlockSize_Without_Visualization.png)
+![](images/FPSvsBlockSize_With_Visualization.png)
+
+* From the plots above, it is clear that the coherent search algorithm does in fact offer improved performance because of the more efficient memory access to the boids' position and velocity data.
+
+* Increasing the number of neighboring cells checked from 8 to 27 marginally improves performance until the boid count is increased beyond 20,000. This is because, even though fewer cells are searched in the 8-cell case, the search space they cover is still larger, which means a larger number of boids to check.
+
+![](images/FPSvsBoids_Cells.png)
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
diff --git a/images/FPSvsBlockSize_With_Visualization.png b/images/FPSvsBlockSize_With_Visualization.png
new file mode 100644
index 0000000..3ffd2f6
Binary files /dev/null and b/images/FPSvsBlockSize_With_Visualization.png differ
diff --git a/images/FPSvsBlockSize_Without_Visualization.png b/images/FPSvsBlockSize_Without_Visualization.png
new file mode 100644
index 0000000..e359734
Binary files /dev/null and b/images/FPSvsBlockSize_Without_Visualization.png differ
diff --git a/images/FPSvsBoids_Cells.png b/images/FPSvsBoids_Cells.png
new file mode 100644
index 0000000..03527fc
Binary files /dev/null and b/images/FPSvsBoids_Cells.png differ
diff --git a/images/FPSvsBoids_With_Visualization.png b/images/FPSvsBoids_With_Visualization.png
new file mode 100644
index 0000000..ae06db6
Binary files /dev/null and b/images/FPSvsBoids_With_Visualization.png differ
diff --git a/images/FPSvsBoids_Without_Visualization.png b/images/FPSvsBoids_Without_Visualization.png
new file mode 100644
index 0000000..586b3a2
Binary files /dev/null and b/images/FPSvsBoids_Without_Visualization.png differ
diff --git a/images/NaiveBoid.gif b/images/NaiveBoid.gif
new file mode 100644
index 0000000..5f9a17f
Binary files /dev/null and b/images/NaiveBoid.gif differ
diff --git a/images/UniformGridScattered.gif b/images/UniformGridScattered.gif
new file mode 100644
index 0000000..b935a5a
Binary files /dev/null and b/images/UniformGridScattered.gif differ
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..647cb51 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -54,6 +54,9 @@ void checkCUDAError(const char *msg, int line = -1) {
 /*! Size of the starting area in simulation space. */
 #define scene_scale 100.0f
 
+/*! Ratio of the width of each grid cell to the neighborhood distance (either 1 or 2). */
+#define CELL_WIDTH_MAX_DIST_RATIO 2
+
 /***********************************************
 * Kernel state (pointers are device pointers) *
 ***********************************************/
@@ -85,6 +88,8 @@ int *dev_gridCellEndIndices;   // to this cell?
 
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+glm::vec3 *dev_coherentPos;
+glm::vec3 *dev_coherentVel;
 
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
@@ -157,7 +162,7 @@ void Boids::initSimulation(int N) {
   checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
 
   // LOOK-2.1 computing grid params
-  gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
+  gridCellWidth = float(CELL_WIDTH_MAX_DIST_RATIO) * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
   int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
   gridSideCount = 2 * halfSideCount;
 
@@ -169,6 +174,27 @@ void Boids::initSimulation(int N) {
   gridMinimum.z -= halfGridWidth;
 
   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+  cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+  cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+  cudaMalloc((void**)&dev_coherentPos, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_coherentPos failed!");
+
+  cudaMalloc((void**)&dev_coherentVel, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_coherentVel failed!");
+
+  dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+  dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+
   cudaDeviceSynchronize();
 }
 
@@ -230,10 +256,51 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
 * in the `pos` and `vel` arrays.
 */
 __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
-  // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
-  // Rule 2: boids try to stay a distance d away from each other
-  // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+  // Accumulate the inputs of all three flocking rules in one pass over the boids.
+  const glm::vec3 selfPos = pos[iSelf];
+  glm::vec3 centerSum(0.0f);    // Rule 1: sum of neighbor positions
+  glm::vec3 separation(0.0f);   // Rule 2: push away from close neighbors
+  glm::vec3 velocitySum(0.0f);  // Rule 3: sum of neighbor velocities
+  int cohesionCount = 0;
+  int alignmentCount = 0;
+
+  for (int other = 0; other < N; ++other) {
+    if (other == iSelf) {
+      continue;  // a boid is never its own neighbor
+    }
+
+    const glm::vec3 otherPos = pos[other];
+    const float dist = glm::distance(otherPos, selfPos);
+
+    // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+    if (dist < rule1Distance) {
+      centerSum += otherPos;
+      ++cohesionCount;
+    }
+
+    // Rule 2: boids try to stay a distance d away from each other
+    if (dist < rule2Distance) {
+      separation -= otherPos - selfPos;
+    }
+
+    // Rule 3: boids try to match the speed of surrounding boids
+    if (dist < rule3Distance) {
+      velocitySum += vel[other];
+      ++alignmentCount;
+    }
+  }
+
+  // Rules 1 and 3 contribute only when they found neighbors. Without the guard
+  // on rule 1, an empty neighborhood leaves the "center" at (0, 0, 0) and the
+  // boid is steered towards the origin instead of being left alone.
+  glm::vec3 change = separation * rule2Scale;
+  if (cohesionCount > 0) {
+    change += (centerSum / float(cohesionCount) - selfPos) * rule1Scale;
+  }
+  if (alignmentCount > 0) {
+    change += (velocitySum / float(alignmentCount)) * rule3Scale;
+  }
+  return change;
 }
 
 /**
@@ -243,8 +310,21 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po
 __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
   glm::vec3 *vel1, glm::vec3 *vel2) {
   // Compute a new velocity based on pos and vel1
-  // Clamp the speed
-  // Record the new velocity into vel2. Question: why NOT vel1?
+  const int self = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (self < N) {
+    // Old velocity plus the steering contribution from the other boids.
+    glm::vec3 updated = vel1[self] + computeVelocityChange(N, self, pos, vel1);
+    // Clamp the speed
+    const float speed = glm::length(updated);
+    if (speed > maxSpeed) {
+      updated = glm::normalize(updated) * maxSpeed;
+    }
+
+    // Record the new velocity into vel2. Question: why NOT vel1?
+    // (computeVelocityChange reads other boids' entries of vel1, so vel1 has
+    // to stay untouched while the kernel runs.)
+    vel2[self] = updated;
+  }
 }
 
 /**
@@ -280,15 +360,24 @@ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) {
 //             for(z)? Or some other order?
 __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) {
   return x + y * gridResolution + z * gridResolution * gridResolution;
-}
+} 
 
 __global__ void kernComputeIndices(int N, int gridResolution,
   glm::vec3 gridMin, float inverseCellWidth,
   glm::vec3 *pos, int *indices, int *gridIndices) {
-    // TODO-2.1
-    // - Label each boid with the index of its grid cell.
+    // One thread per boid; threads past the end of the arrays do nothing.
+    const int boid = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (boid >= N) {
+        return;
+    }
+
     // - Set up a parallel array of integer indices as pointers to the actual
     //   boid data in pos and vel1/vel2
+    indices[boid] = boid;
+
+    // - Label each boid with the 1D index of the grid cell that contains it.
+    glm::ivec3 cell = glm::floor((pos[boid] - gridMin) * inverseCellWidth);
+    gridIndices[boid] = gridIndex3Dto1D(cell.x, cell.y, cell.z, gridResolution);
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -303,7 +392,30 @@ __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) {
 __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   int *gridCellStartIndices, int *gridCellEndIndices) {
   // TODO-2.1
+  // Expects particleGridIndices sorted by cell id; each thread inspects one boid.
   // Identify the start point of each cell in the gridIndices array.
+  const int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {
+    return;
+  }
+
+  const int currGridIndex = particleGridIndices[index];
+  // A run of equal cell ids starts at the first boid or where the id changes.
+  if (index == 0) {
+    gridCellStartIndices[currGridIndex] = index;
+  } else {
+    const int prevGridIndex = particleGridIndices[index - 1];
+    if (prevGridIndex != currGridIndex) {
+      gridCellStartIndices[currGridIndex] = index;
+      // Close the run of the cell the previous boid is really in: occupied
+      // cell ids need not be consecutive, so currGridIndex - 1 may be wrong.
+      gridCellEndIndices[prevGridIndex] = index - 1;
+    }
+  }
+  // The last boid always closes its cell's run (this also covers N == 1).
+  if (index == N - 1) {
+    gridCellEndIndices[currGridIndex] = index;
+  }
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
 }
@@ -317,70 +429,318 @@ __global__ void kernUpdateVelNeighborSearchScattered(
   // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
   // the number of boids that need to be checked.
   // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+    if (index >= N) {
+        return;
+    }
+
+    glm::vec3 currPos = pos[index];
+
+    // - Label each boid with the index of its grid cell.
+    glm::ivec3 gridIdx3D = glm::floor((currPos - gridMin) * inverseCellWidth);
+
+    // - Identify which cells may contain neighbors. This isn't always 8.
+    glm::vec3 cellCenter3D = (glm::vec3(gridIdx3D) + 0.5f) * cellWidth + gridMin;
+    glm::ivec3 startIdx;
+    glm::ivec3 endIdx;
+    if (CELL_WIDTH_MAX_DIST_RATIO == 1) {
+        startIdx = glm::ivec3(-1);
+        endIdx = glm::ivec3(1);
+    }
+    else {
+        glm::bvec3 checkIdx = glm::lessThan(currPos, cellCenter3D);
+        startIdx = -glm::ivec3(checkIdx);
+        endIdx = 1 - glm::ivec3(checkIdx);
+    }
+
+    glm::vec3 perceived_center = glm::vec3(0.0f);
+    glm::vec3 perceived_velocity = glm::vec3(0.0f);
+    glm::vec3 c = glm::vec3(0.0f);
+    int neighbors1 = 0;
+    int neighbors3 = 0;
+
+    // - For each cell, read the start/end indices in the boid pointer array.
+    for (int i = startIdx.x; i <= endIdx.x; i++) {
+        for (int j = startIdx.y; j <= endIdx.y; j++) {
+            for (int k = startIdx.z; k <= endIdx.z; k++) {
+
+                glm::ivec3 currGridIdx = gridIdx3D + glm::ivec3{ i,j,k };
+                glm::ivec3 upperLimit = glm::ivec3(gridResolution - 1);
+
+                if (!(upperLimit.x <= currGridIdx.x <= 0 || upperLimit.y <= currGridIdx.y <= 0 || upperLimit.z <= currGridIdx.z <= 0))
+                    return;
+
+                int gridIdx1D = gridIndex3Dto1D(currGridIdx.x, currGridIdx.y,
+                    currGridIdx.z + k, gridResolution);
+
+                // starting and ending indices of boids
+                int startBoidIdx = gridCellStartIndices[gridIdx1D], endBoidIdx = gridCellEndIndices[gridIdx1D];
+
+                // - Access each boid in the cell and compute velocity change from
+                //   the boids rules, if this boid is within the neighborhood distance.
+                if (startBoidIdx >= 0 && endBoidIdx <= N - 1) {
+                    for (int* p = &particleArrayIndices[startBoidIdx];
+                        p <= &particleArrayIndices[endBoidIdx]; p++) {
+                        int idx = *p;
+
+                        if (idx == index)
+                            continue;
+
+                        float dist = glm::distance(currPos, pos[idx]);
+
+                        // Rule - 1
+
+                        if (dist < rule1Distance) {
+                            perceived_center += pos[idx];
+                            neighbors1++;
+                        }
+
+                        // Rule 2: boids try to stay a distance d away from each other
+                        if (dist < rule2Distance)
+                            c -= (pos[idx] - currPos);
+
+                        // Rule 3: boids try to match the speed of surrounding boids
+                        if (dist < rule3Distance) {
+                            perceived_velocity += vel1[idx];
+                            neighbors3++;
+                        }
+                    }
+                }
+            }
+        }
+    }
+    if (neighbors1 > 0)
+        perceived_center /= neighbors1;
+    glm::vec3 v1 = (perceived_center - currPos) * rule1Scale;
+
+    glm::vec3 v2 = c * rule2Scale;
+
+    if (neighbors3 > 0)
+        perceived_velocity /= neighbors3;
+    glm::vec3 v3 = perceived_velocity * rule3Scale;
+
+    glm::vec3 v_new = vel1[index] + v1 + v2 + v3;
+
+    // - Clamp the speed change before putting the new speed in vel2
+    if (glm::length(v_new) > maxSpeed) {
+        v_new = glm::normalize(v_new) * maxSpeed;
+    }
+    vel2[index] = v_new;
+}
+
+__global__ void kernReshufflePosVel(int N, int* particleArrayIndices,
+    glm::vec3* pos, glm::vec3* vel, glm::vec3* coherentPos, glm::vec3* coherentVel) {
+    // Gather: output slot i receives the boid that particleArrayIndices[i] names.
+    const int slot = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (slot < N) {
+        const int source = particleArrayIndices[slot];
+        coherentPos[slot] = pos[source];
+        coherentVel[slot] = vel[source];
+    }
 }
 
+
 __global__ void kernUpdateVelNeighborSearchCoherent(
   int N, int gridResolution, glm::vec3 gridMin,
   float inverseCellWidth, float cellWidth,
   int *gridCellStartIndices, int *gridCellEndIndices,
   glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
-  // except with one less level of indirection.
-  // This should expect gridCellStartIndices and gridCellEndIndices to refer
-  // directly to pos and vel1.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  //   DIFFERENCE: For best results, consider what order the cells should be
-  //   checked in to maximize the memory benefits of reordering the boids data.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+  // Velocity update over the uniform grid for cell-coherent boid data.
+  //
+  // pos and vel1 are expected to be ordered by grid cell already, so that
+  // gridCellStartIndices / gridCellEndIndices (inclusive boid range per cell,
+  // negative start for an empty cell) index them directly, with one less level
+  // of indirection than kernUpdateVelNeighborSearchScattered.
+  // The clamped new velocity of boid `index` is written to vel2[index]; vel1 is
+  // only read, since other threads use the old velocities of their neighbors.
+  const int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {
+    return;
+  }
+
+  const glm::vec3 currPos = pos[index];
+
+  // - Identify the grid cell that this particle is in
+  const glm::ivec3 gridIdx3D = glm::floor((currPos - gridMin) * inverseCellWidth);
+
+  // - Identify which cells may contain neighbors. This isn't always 8.
+  //   Ratio 1 (cell width == neighborhood distance): the 3x3x3 block of cells
+  //   around the boid. Otherwise (cell width == 2x that distance): per axis,
+  //   only the own cell and the neighbor on the boid's side of the cell center.
+  glm::ivec3 startIdx(-1);
+  glm::ivec3 endIdx(1);
+  if (CELL_WIDTH_MAX_DIST_RATIO != 1) {
+    const glm::vec3 cellCenter3D = (glm::vec3(gridIdx3D) + 0.5f) * cellWidth + gridMin;
+    const glm::ivec3 inLowerHalf = glm::ivec3(glm::lessThan(currPos, cellCenter3D));
+    startIdx = -inLowerHalf;
+    endIdx = 1 - inLowerHalf;
+  }
+
+  glm::vec3 perceived_center(0.0f);
+  glm::vec3 perceived_velocity(0.0f);
+  glm::vec3 c(0.0f);
+  int neighbors1 = 0;
+  int neighbors3 = 0;
+
+  // - For each cell, read the start/end indices of its boids.
+  //   z is the outermost loop and x the innermost because gridIndex3Dto1D makes
+  //   x the fastest-varying coordinate, so cells are visited in increasing 1D
+  //   index - the same order in which the cell-sorted boid data is stored.
+  for (int k = startIdx.z; k <= endIdx.z; k++) {
+    const int cellZ = gridIdx3D.z + k;
+    if (cellZ < 0 || cellZ >= gridResolution) {
+      continue;  // outside the grid: skip this slab, keep processing the boid
+    }
+    for (int j = startIdx.y; j <= endIdx.y; j++) {
+      const int cellY = gridIdx3D.y + j;
+      if (cellY < 0 || cellY >= gridResolution) {
+        continue;
+      }
+      for (int i = startIdx.x; i <= endIdx.x; i++) {
+        const int cellX = gridIdx3D.x + i;
+        if (cellX < 0 || cellX >= gridResolution) {
+          continue;
+        }
+        // cellX/Y/Z already include the loop offsets.
+        const int gridIdx1D = gridIndex3Dto1D(cellX, cellY, cellZ, gridResolution);
+        const int startBoidIdx = gridCellStartIndices[gridIdx1D];
+        const int endBoidIdx = gridCellEndIndices[gridIdx1D];
+        if (startBoidIdx < 0 || endBoidIdx > N - 1) {
+          continue;  // empty cell (or an invalid range)
+        }
+
+        // - Access each boid in the cell and compute velocity change from
+        //   the boids rules, if this boid is within the neighborhood distance.
+        for (int idx = startBoidIdx; idx <= endBoidIdx; idx++) {
+          if (idx == index) {
+            continue;
+          }
+          const glm::vec3 otherPos = pos[idx];
+          const float dist = glm::distance(currPos, otherPos);
+
+          // Rule 1: boids fly towards their local perceived center of mass
+          if (dist < rule1Distance) {
+            perceived_center += otherPos;
+            neighbors1++;
+          }
+
+          // Rule 2: boids try to stay a distance d away from each other
+          if (dist < rule2Distance) {
+            c -= (otherPos - currPos);
+          }
+
+          // Rule 3: boids try to match the speed of surrounding boids
+          if (dist < rule3Distance) {
+            perceived_velocity += vel1[idx];
+            neighbors3++;
+          }
+        }
+      }
+    }
+  }
+
+  // Rules 1 and 3 contribute only if they found neighbors; otherwise rule 1
+  // would treat the origin as the center of mass and pull lone boids to it.
+  glm::vec3 v_new = vel1[index] + c * rule2Scale;
+  if (neighbors1 > 0) {
+    v_new += (perceived_center / float(neighbors1) - currPos) * rule1Scale;
+  }
+  if (neighbors3 > 0) {
+    v_new += (perceived_velocity / float(neighbors3)) * rule3Scale;
+  }
+
+  // - Clamp the speed change before putting the new speed in vel2
+  if (glm::length(v_new) > maxSpeed) {
+    v_new = glm::normalize(v_new) * maxSpeed;
+  }
+  vel2[index] = v_new;
 }
 
+
 /**
 * Step the entire N-body simulation by `dt` seconds.
 */
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
   // TODO-1.2 ping-pong the velocity buffers
-}
+  const dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+  // Read dev_vel1, write dev_vel2: every thread must see the old velocities.
+  kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!");
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdatePos failed!");
+  std::swap(dev_vel1, dev_vel2);  // the new velocities become next step's input
+}
 
 void Boids::stepSimulationScatteredGrid(float dt) {
   // TODO-2.1
   // Uniform Grid Neighbor search using Thrust sort.
   // In Parallel:
   // - label each particle with its array index as well as its grid index.
+  const dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);  // one thread per boid
+  const dim3 cellBlocks((gridCellCount + blockSize - 1) / blockSize);      // one thread per cell
+
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum,
+    gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+  checkCUDAErrorWithLine("kernComputeIndices failed!");
   //   Use 2x width grids.
   // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
   //   are welcome to do a performance comparison.
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
   // - Naively unroll the loop for finding the start and end indices of each
   //   cell's data pointers in the array of boid indices
+  kernResetIntBuffer<<<cellBlocks, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<cellBlocks, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+  checkCUDAErrorWithLine("kernResetIntBuffer failed!");
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices,
+    dev_gridCellStartIndices, dev_gridCellEndIndices);
+  checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!");
   // - Perform velocity updates using neighbor search
+  kernUpdateVelNeighborSearchScattered<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount,
+    gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices,
+    dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!");
   // - Update positions
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdatePos failed!");
   // - Ping-pong buffers as needed
+  std::swap(dev_vel1, dev_vel2);
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
   // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
   // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
+  const dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);  // one thread per boid
+  const dim3 cellBlocks((gridCellCount + blockSize - 1) / blockSize);      // one thread per cell
   // In Parallel:
   // - Label each particle with its array index as well as its grid index.
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum,
+    gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
   //   Use 2x width grids
   // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
   //   are welcome to do a performance comparison.
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
   // - Naively unroll the loop for finding the start and end indices of each
   //   cell's data pointers in the array of boid indices
+  kernResetIntBuffer<<<cellBlocks, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<cellBlocks, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+  // The next two kernels run one thread per boid, so they need the per-boid grid.
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
   // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
   //   the particle data in the simulation array.
+  kernReshufflePosVel<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleArrayIndices, dev_pos, dev_vel1, dev_coherentPos, dev_coherentVel);
   //   CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED
   // - Perform velocity updates using neighbor search
+  kernUpdateVelNeighborSearchCoherent<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount,
+    gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices,
+    dev_coherentPos, dev_coherentVel, dev_vel2);
   // - Update positions
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_coherentPos, dev_vel2);
+  checkCUDAErrorWithLine("stepSimulationCoherentGrid kernels failed!");
+  std::swap(dev_vel1, dev_vel2);        // dev_vel1 now holds the new velocities (cell order)
+  std::swap(dev_coherentPos, dev_pos);  // dev_pos now holds the new positions (same order)
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
 }
 
@@ -390,6 +750,10 @@ void Boids::endSimulation() {
   cudaFree(dev_pos);
 
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+  cudaFree(dev_gridCellEndIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_particleGridIndices);
 }
 
 void Boids::unitTest() {
diff --git a/src/main.cpp b/src/main.cpp
index fe657ed..1f9584c 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -21,7 +21,7 @@
 #define COHERENT_GRID 0
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
+const int N_FOR_VIS = 10000;
 const float DT = 0.2f;
 
 /**