diff --git a/README.md b/README.md
index 98dd9a8..0ee2f58 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,26 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking**
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Carolina Zheng
+* Tested on: Windows 7, i7-6700 @ 3.40GHz 16GB, Quadro K620 (Moore 100 Lab)
 
-### (TODO: Your README)
+### Performance Analysis
+(Sorry, screenshot and GIF coming soon.)
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+FPS was measured with CUDA event timers around the simulation step kernels only.
+
+#### Number of boids vs. FPS
+![](images/boids-fps.png)
+
+Increasing the number of boids decreased performance for all three simulation types. This was the result I expected: as the number of boids grows, so do the number of threads and, more importantly, the number of neighbors that must be checked during each velocity update.
+
+#### Block size vs. FPS
+![](images/blocksize-fps.png)
+
+Changing the block size had no noticeable impact on performance for any of the three simulation types. This makes sense to me, since I wouldn't expect any of the kernel operations in this assignment to stall, nor do the kernels I wrote use an excessive amount of memory.
+
+#### Coherent vs. scattered uniform grid
+My coherent grid was slower than the scattered grid. Initially this surprised me, since I expected that removing a layer of indirection would improve performance. On reflection, the coherent grid could have been slower because the execution time of the extra kernel that reshuffles position and velocity outweighed the benefit of contiguous memory access to those two arrays. It is also possible that I implemented the nested loops incorrectly and did not fully achieve contiguous memory access.
+
+#### Varying cell width
+These results are not graphed, but they were interesting. For the scattered grid, halving the cell width (and therefore checking 27 neighboring cells instead of 8) decreased performance by about 20%, whereas for the coherent grid it *increased* performance by about the same amount. My best explanation is that with more neighbors to check, the benefit of contiguous memory access in the coherent grid is magnified, while the scattered grid gains no such advantage and must check more entries in the cell start and end index arrays.
diff --git a/images/blocksize-fps.png b/images/blocksize-fps.png
new file mode 100644
index 0000000..7893c26
Binary files /dev/null and b/images/blocksize-fps.png differ
diff --git a/images/boids-fps.png b/images/boids-fps.png
new file mode 100644
index 0000000..d38f0a8
Binary files /dev/null and b/images/boids-fps.png differ
diff --git a/src/kernel.cu b/src/kernel.cu
index aaf0fbf..5ddd3ec 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -17,6 +17,9 @@
 #define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
 
+// Number of neighboring cells to check: 8 when cell width = 2 * max rule distance, 27 when cell width = max rule distance
+#define NUM_NEIGHBORS 8
+
 /**
 * Check for CUDA errors; print and exit if there was a problem.
 */
@@ -85,6 +88,7 @@ int *dev_gridCellEndIndices; // to this cell?
 
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
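+// Buffer for 2.3: positions reshuffled so that boids in the same grid cell are
+// contiguous in memory (filled by kernSortPosAndVel; dev_vel2 is reused to hold
+// the matching reshuffled velocities).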
+glm::vec3 *dev_sortedPos;
 
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
@@ -169,6 +173,21 @@ void Boids::initSimulation(int N) {
   gridMinimum.z -= halfGridWidth;
 
   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+  cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+  cudaMalloc((void**)&dev_sortedPos, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_sortedPos failed!");
+
   cudaThreadSynchronize();
 }
 
@@ -233,7 +252,55 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos
   // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
   // Rule 2: boids try to stay a distance d away from each other
   // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 thisPos = pos[iSelf];
+  glm::vec3 perceivedCenter;
+  glm::vec3 cohesionVel;
+  glm::vec3 separationVel;
+  glm::vec3 alignmentVel;
+  float rule1NeighborCount = 0.f;
+  float rule3NeighborCount = 0.f;
+
+  for (int i = 0; i < N; i++)
+  {
+    if (i == iSelf)
+    {
+      continue;
+    }
+
+    glm::vec3 otherPos = pos[i];
+    float distance = glm::distance(thisPos, otherPos);
+
+    if (distance < rule1Distance)
+    {
+      perceivedCenter += otherPos;
+      rule1NeighborCount += 1.0;
+    }
+
+    if (distance < rule2Distance)
+    {
+      separationVel -= otherPos - thisPos;
+    }
+
+    if (distance < rule3Distance)
+    {
+      alignmentVel += vel[i];
+      rule3NeighborCount += 1.0;
+    }
+  }
+
+  if (rule1NeighborCount > 0)
+  {
+    cohesionVel = (perceivedCenter / rule1NeighborCount - thisPos) * rule1Scale;
+  }
+
+  separationVel *= rule2Scale;
+
+  if (rule3NeighborCount > 0)
+  {
+    alignmentVel = alignmentVel / rule3NeighborCount * rule3Scale;
+  }
+
+  return cohesionVel + separationVel + alignmentVel;
 }
 
 /**
@@ -245,6 +312,21 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
   // Compute a new velocity based on pos and vel1
   // Clamp the speed
   // Record the new velocity into vel2. Question: why NOT vel1?
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+  if (index >= N)
+  {
+    return;
+  }
+
+  glm::vec3 newVel = vel1[index] + computeVelocityChange(N, index, pos, vel1);
+
+  if (glm::length(newVel) > maxSpeed)
+  {
+    newVel = glm::normalize(newVel) * maxSpeed;
+  }
+
+  vel2[index] = newVel;
 }
 
 /**
@@ -279,6 +361,11 @@ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) {
 // for(y)
 // for(z)? Or some other order?
 __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) {
+  if (x < 0 || x >= gridResolution || y < 0 || y >= gridResolution || z < 0 || z >= gridResolution)
+  {
+    return -1;
+  }
+
   return x + y * gridResolution + z * gridResolution * gridResolution;
 }
 
@@ -289,6 +376,18 @@ __global__ void kernComputeIndices(int N, int gridResolution,
   // - Label each boid with the index of its grid cell.
   // - Set up a parallel array of integer indices as pointers to the actual
   //   boid data in pos and vel1/vel2
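+  // The cell holding a boid is found by quantizing its position: take
+  // floor((pos - gridMin) * inverseCellWidth) per axis, then flatten the 3D cell
+  // coordinate to 1D as x + y * gridResolution + z * gridResolution^2.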
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+  if (index >= N)
+  {
+    return;
+  }
+
+  glm::vec3 thisPos = pos[index];
+  glm::vec3 gridIndex3D = glm::floor((thisPos - gridMin) * inverseCellWidth);
+
+  gridIndices[index] = gridIndex3Dto1D(gridIndex3D.x, gridIndex3D.y, gridIndex3D.z, gridResolution);
+  indices[index] = index;
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -306,6 +405,45 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+  if (index >= N)
+  {
+    return;
+  }
+
+  int gridIndex = particleGridIndices[index];
+
+  if (index == 0)
+  {
+    gridCellStartIndices[gridIndex] = index;
+    gridCellEndIndices[gridIndex] = index;
+  }
+  else
+  {
+    int prevGridIndex = particleGridIndices[index - 1];
+
+    if (prevGridIndex != gridIndex)
+    {
+      gridCellStartIndices[gridIndex] = index;
+      gridCellEndIndices[prevGridIndex] = index - 1;
+    }
+  }
+
+  // The last boid in the sorted array closes its cell; without this, that cell's
+  // end index stays at -1 and its boids are never visited during neighbor search.
+  if (index == N - 1)
+  {
+    gridCellEndIndices[gridIndex] = index;
+  }
+}
+
+__global__ void kernSortPosAndVel(int N, int *particleArrayIndices, glm::vec3 *pos, glm::vec3 *sortedPos, glm::vec3 *vel, glm::vec3 *sortedVel)
+{
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+  if (index >= N)
+  {
+    return;
+  }
+
+  int sortedIndex = particleArrayIndices[index];
+
+  sortedPos[index] = pos[sortedIndex];
+  sortedVel[index] = vel[sortedIndex];
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
@@ -322,6 +460,111 @@ __global__ void kernUpdateVelNeighborSearchScattered(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
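+  // Cell selection used below: with NUM_NEIGHBORS == 8, the boid's position relative
+  // to its cell's center picks the 2x2x2 block of cells offset toward that corner;
+  // with NUM_NEIGHBORS == 27, the loops run from -1 to 1 and cover the full 3x3x3
+  // block. gridIndex3Dto1D() returns -1 for out-of-bounds cells, which are skipped.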
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+  if (index >= N)
+  {
+    return;
+  }
+
+  glm::vec3 thisPos = pos[index];
+  glm::vec3 perceivedCenter;
+  glm::vec3 cohesionVel;
+  glm::vec3 separationVel;
+  glm::vec3 alignmentVel;
+  float rule1NeighborCount = 0.f;
+  float rule3NeighborCount = 0.f;
+
+  int neighborIndices[NUM_NEIGHBORS];
+  int currentNeighborIndex = 0;
+  glm::vec3 gridIndex3D = glm::floor((thisPos - gridMin) * inverseCellWidth);
+  glm::vec3 gridCenter = (gridIndex3D - gridResolution * 0.5f) * cellWidth + (cellWidth * 0.5f);
+  glm::vec3 displacement = glm::sign(thisPos - gridCenter);
+
+  int startValue = 0;
+  if (NUM_NEIGHBORS == 27)
+  {
+    startValue = -1;
+    displacement = glm::vec3(1);
+  }
+
+  for (int z = startValue; z < 2; z++)
+  {
+    for (int y = startValue; y < 2; y++)
+    {
+      for (int x = startValue; x < 2; x++)
+      {
+        neighborIndices[currentNeighborIndex++] = gridIndex3Dto1D(gridIndex3D.x + x * displacement.x, gridIndex3D.y + y * displacement.y, gridIndex3D.z + z * displacement.z, gridResolution);
+      }
+    }
+  }
+
+  for (int neighborCubeIndex = 0; neighborCubeIndex < currentNeighborIndex; neighborCubeIndex++)
+  {
+    int neighborGridIndex = neighborIndices[neighborCubeIndex];
+
+    // An invalid 3D grid index was given
+    if (neighborGridIndex == -1)
+    {
+      continue;
+    }
+
+    int startIndex = gridCellStartIndices[neighborGridIndex];
+    int endIndex = gridCellEndIndices[neighborGridIndex];
+
+    for (int otherGridArrayIndex = startIndex; otherGridArrayIndex <= endIndex; otherGridArrayIndex++)
+    {
+      int otherParticleArrayIndex = particleArrayIndices[otherGridArrayIndex];
+
+      if (otherParticleArrayIndex == index)
+      {
+        continue;
+      }
+
+      glm::vec3 otherPos = pos[otherParticleArrayIndex];
+      glm::vec3 otherVel = vel1[otherParticleArrayIndex];
+
+      float distance = glm::distance(thisPos, otherPos);
+
+      if (distance < rule1Distance)
+      {
+        perceivedCenter += otherPos;
+        rule1NeighborCount += 1.0;
+      }
+
+      if (distance < rule2Distance)
+      {
+        separationVel -= otherPos - thisPos;
+      }
+
+      if (distance < rule3Distance)
+      {
+        alignmentVel += otherVel;
+        rule3NeighborCount += 1.0;
+      }
+    }
+  }
+
+  if (rule1NeighborCount > 0)
+  {
+    cohesionVel = (perceivedCenter / rule1NeighborCount - thisPos) * rule1Scale;
+  }
+
+  separationVel *= rule2Scale;
+
+  if (rule3NeighborCount > 0)
+  {
+    alignmentVel = alignmentVel / rule3NeighborCount * rule3Scale;
+  }
+
+  glm::vec3 newVel = vel1[index] + cohesionVel + separationVel + alignmentVel;
+
+  if (glm::length(newVel) > maxSpeed)
+  {
+    newVel = glm::normalize(newVel) * maxSpeed;
+  }
+
+  vel2[index] = newVel;
 }
 
 __global__ void kernUpdateVelNeighborSearchCoherent(
@@ -341,6 +584,109 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
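+  // Same neighbor search as the scattered version, but gridCellStartIndices and
+  // gridCellEndIndices index directly into the reshuffled pos/vel buffers, so the
+  // particleArrayIndices lookup is gone and the boids of a cell are read from
+  // consecutive memory locations.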
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+  if (index >= N)
+  {
+    return;
+  }
+
+  glm::vec3 thisPos = pos[index];
+  glm::vec3 perceivedCenter;
+  glm::vec3 cohesionVel;
+  glm::vec3 separationVel;
+  glm::vec3 alignmentVel;
+  float rule1NeighborCount = 0.f;
+  float rule3NeighborCount = 0.f;
+
+  int neighborIndices[NUM_NEIGHBORS];
+  int currentNeighborIndex = 0;
+  glm::vec3 gridIndex3D = glm::floor((thisPos - gridMin) * inverseCellWidth);
+  glm::vec3 gridCenter = (gridIndex3D - gridResolution * 0.5f) * cellWidth + (cellWidth * 0.5f);
+  glm::vec3 displacement = glm::sign(thisPos - gridCenter);
+
+  int startValue = 0;
+  if (NUM_NEIGHBORS == 27)
+  {
+    startValue = -1;
+    displacement = glm::vec3(1);
+  }
+
+  for (int z = startValue; z < 2; z++)
+  {
+    for (int y = startValue; y < 2; y++)
+    {
+      for (int x = startValue; x < 2; x++)
+      {
+        neighborIndices[currentNeighborIndex++] = gridIndex3Dto1D(gridIndex3D.x + x * displacement.x, gridIndex3D.y + y * displacement.y, gridIndex3D.z + z * displacement.z, gridResolution);
+      }
+    }
+  }
+
+  for (int neighborCubeIndex = 0; neighborCubeIndex < currentNeighborIndex; neighborCubeIndex++)
+  {
+    int neighborGridIndex = neighborIndices[neighborCubeIndex];
+
+    // An invalid 3D grid index was given
+    if (neighborGridIndex == -1)
+    {
+      continue;
+    }
+
+    int startIndex = gridCellStartIndices[neighborGridIndex];
+    int endIndex = gridCellEndIndices[neighborGridIndex];
+
+    for (int otherGridArrayIndex = startIndex; otherGridArrayIndex <= endIndex; otherGridArrayIndex++)
+    {
+      if (otherGridArrayIndex == index)
+      {
+        continue;
+      }
+
+      glm::vec3 otherPos = pos[otherGridArrayIndex];
+      glm::vec3 otherVel = vel1[otherGridArrayIndex];
+
+      float distance = glm::distance(thisPos, otherPos);
+
+      if (distance < rule1Distance)
+      {
+        perceivedCenter += otherPos;
+        rule1NeighborCount += 1.0;
+      }
+
+      if (distance < rule2Distance)
+      {
+        separationVel -= otherPos - thisPos;
+      }
+
+      if (distance < rule3Distance)
+      {
+        alignmentVel += otherVel;
+        rule3NeighborCount += 1.0;
+      }
+    }
+  }
+
+  if (rule1NeighborCount > 0)
+  {
+    cohesionVel = (perceivedCenter / rule1NeighborCount - thisPos) * rule1Scale;
+  }
+
+  separationVel *= rule2Scale;
+
+  if (rule3NeighborCount > 0)
+  {
+    alignmentVel = alignmentVel / rule3NeighborCount * rule3Scale;
+  }
+
+  glm::vec3 newVel = vel1[index] + cohesionVel + separationVel + alignmentVel;
+
+  if (glm::length(newVel) > maxSpeed)
+  {
+    newVel = glm::normalize(newVel) * maxSpeed;
+  }
+
+  vel2[index] = newVel;
 }
 
 /**
@@ -349,6 +695,14 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
   // TODO-1.2 ping-pong the velocity buffers
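+  // New velocities are written to vel2 while vel1 is read (updating vel1 in place
+  // would let some threads see neighbors' already-updated velocities), then the two
+  // buffer pointers are swapped after the step.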
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+  kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+
+  glm::vec3 *temp = dev_vel1;
+  dev_vel1 = dev_vel2;
+  dev_vel2 = temp;
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
@@ -364,6 +718,27 @@ void Boids::stepSimulationScatteredGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed
+  dim3 particleBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  dim3 cellBlocksPerGrid((gridCellCount + blockSize - 1) / blockSize);
+
+  kernComputeIndices<<<particleBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+  thrust::device_ptr<int> dev_thrust_keys(dev_particleGridIndices);
+  thrust::device_ptr<int> dev_thrust_values(dev_particleArrayIndices);
+  thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + numObjects, dev_thrust_values);
+
+  kernResetIntBuffer<<<cellBlocksPerGrid, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<cellBlocksPerGrid, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+
+  kernIdentifyCellStartEnd<<<particleBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+  kernUpdateVelNeighborSearchScattered<<<particleBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth,
+    dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+  kernUpdatePos<<<particleBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+
+  // Ping-pong the velocity buffers (dev_sortedPos is not used in the scattered path)
+  glm::vec3 *temp = dev_vel1;
+  dev_vel1 = dev_vel2;
+  dev_vel2 = temp;
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
@@ -382,6 +757,30 @@ void Boids::stepSimulationCoherentGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+  dim3 particleBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  dim3 cellBlocksPerGrid((gridCellCount + blockSize - 1) / blockSize);
+
+  kernComputeIndices<<<particleBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+  thrust::device_ptr<int> dev_thrust_keys(dev_particleGridIndices);
+  thrust::device_ptr<int> dev_thrust_values(dev_particleArrayIndices);
+  thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + numObjects, dev_thrust_values);
+
+  kernResetIntBuffer<<<cellBlocksPerGrid, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<cellBlocksPerGrid, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+
+  kernIdentifyCellStartEnd<<<particleBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+  // Reshuffle positions into dev_sortedPos and velocities into dev_vel2 so each cell's boids are contiguous
+  kernSortPosAndVel<<<particleBlocksPerGrid, blockSize>>>(numObjects, dev_particleArrayIndices, dev_pos, dev_sortedPos, dev_vel1, dev_vel2);
+
+  kernUpdateVelNeighborSearchCoherent<<<particleBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth,
+    dev_gridCellStartIndices, dev_gridCellEndIndices, dev_sortedPos, dev_vel2, dev_vel1);
+
+  kernUpdatePos<<<particleBlocksPerGrid, blockSize>>>(numObjects, dt, dev_sortedPos, dev_vel1);
+
+  glm::vec3 *temp = dev_pos;
+  dev_pos = dev_sortedPos;
+  dev_sortedPos = temp;
 }
 
 void Boids::endSimulation() {
@@ -390,6 +789,11 @@ void Boids::endSimulation() {
   cudaFree(dev_vel1);
   cudaFree(dev_vel2);
   cudaFree(dev_pos);
 
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_particleGridIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_gridCellEndIndices);
+  cudaFree(dev_sortedPos);
 }
 
 void Boids::unitTest() {
diff --git a/src/main.cpp b/src/main.cpp
index a29471d..f210dc3 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,9 +13,9 @@
 // ================
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
-#define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define VISUALIZE 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
 const int N_FOR_VIS = 5000;
@@ -186,7 +186,7 @@ void initShaders(GLuint * program) {
 //====================================
 // Main loop
 //====================================
-  void runCUDA() {
+  void runCUDA(double &timeElapsed) {
   // Map OpenGL buffer object for writing from CUDA on a single GPU
   // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not
   // use this buffer
@@ -195,9 +195,15 @@ void initShaders(GLuint * program) {
   float *dptrVertPositions = NULL;
   float *dptrVertVelocities = NULL;
 
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  float ms;
+
   cudaGLMapBufferObject((void**)&dptrVertPositions, boidVBO_positions);
   cudaGLMapBufferObject((void**)&dptrVertVelocities, boidVBO_velocities);
 
+  cudaEventRecord(start);
   // execute the kernel
 #if UNIFORM_GRID && COHERENT_GRID
   Boids::stepSimulationCoherentGrid(DT);
@@ -206,6 +212,11 @@ void initShaders(GLuint * program) {
 #else
   Boids::stepSimulationNaive(DT);
 #endif
+  cudaEventRecord(stop);
+
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&ms, start, stop);
+  timeElapsed += ms;
 
 #if VISUALIZE
   Boids::copyBoidsToVBO(dptrVertPositions, dptrVertVelocities);
@@ -219,6 +230,8 @@ void initShaders(GLuint * program) {
   double fps = 0;
   double timebase = 0;
   int frame = 0;
+  double timeElapsed = 0;
+  float framesElapsed = 0.f;
 
   Boids::unitTest(); // LOOK-1.2 We run some basic example code to make sure
                      // your CUDA development setup is ready to go.
@@ -227,6 +240,7 @@ void initShaders(GLuint * program) {
     glfwPollEvents();
 
     frame++;
+    framesElapsed += 1.f;
     double time = glfwGetTime();
 
     if (time - timebase > 1.0) {
@@ -235,7 +249,15 @@ void initShaders(GLuint * program) {
       frame = 0;
     }
 
-    runCUDA();
+    runCUDA(timeElapsed);
+
+    if (timeElapsed >= 1000)
+    {
+      std::cout << "fps: ";
+      std::cout << framesElapsed / (timeElapsed / 1000) << std::endl;
+      framesElapsed = 0;
+      timeElapsed = 0;
+    }
 
     std::ostringstream ss;
     ss << "[";