diff --git a/README.md b/README.md
index d63a6a1..aa6fe0b 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,25 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Eric Chiu
+* Tested on: Windows 10 Education, Intel(R) Xeon(R) CPU E5-1630 v4 @ 3.60GHz, 32GB, NVIDIA GeForce GTX 1070 (SIGLAB)
 
-### (TODO: Your README)
+## Result
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+![](./images/Boids.gif)
+
+## Performance Analysis
+
+![](./images/Boids-FPS-With-Visualization.png)
+
+![](./images/Boids-FPS-Without-Visualization.png)
+
+As the graphs show, increasing the number of boids generally lowers the frame rate for all three implementations: naive, scattered uniform grid, and coherent uniform grid. This is most likely because a denser simulation places more boids within the neighborhood distance, so more neighbors must be examined to compute the next velocity and position of each boid.
+
+The coherent uniform grid did improve performance over the scattered uniform grid, but only by a small margin (15% to 20%); I had expected it to be at least twice as fast. On reflection, cutting out the middleman (the extra index lookup) makes the memory accesses faster and more contiguous, but roughly the same number of rule computations is still needed per boid, so a 15% to 20% improvement is reasonable. The sketch at the end of this section illustrates the difference.
+
+![](./images/Block-FPS-Without-Visualization.png)
+
+Increasing the block size from 16 to 32 improves the frame rate for all implementations: naive, scattered, and coherent. Increasing it further to 64, 128, and beyond barely affects performance. I suspect this is because the warp size is 32, so a block of 16 threads leaves half of each warp idle, while any multiple of 32 keeps the warps fully occupied.
+
+Changing the cell width and checking 27 instead of 8 neighboring cells decreased performance for all implementations. I suspect this is because the more neighboring cells are checked, the more candidate boids must be tested against a boid's neighborhood distance.
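+
+To illustrate the scattered-versus-coherent comparison above, here is a simplified sketch, not the actual kernel code, of the inner neighbor loop in each approach. `applyRules` is a hypothetical stand-in for the three flocking rules; `particleArrayIndices`, `pos`, `vel1`, `coh_pos`, and `coh_vel` are the buffers used in `kernel.cu`. The point is that the coherent grid removes one index lookup and makes the reads contiguous, but it performs the same number of rule evaluations.
+
+```cpp
+#include <glm/glm.hpp>
+
+// Hypothetical stand-in for the cohesion/separation/alignment math that the
+// real kernels perform for each neighbor.
+inline void applyRules(const glm::vec3 &neighborPos, const glm::vec3 &neighborVel) {
+  (void)neighborPos; (void)neighborVel;
+}
+
+// Scattered grid: neighbors are reached through particleArrayIndices (the
+// "middleman"), so the pos/vel reads jump around in memory.
+void scatteredNeighborLoop(int start, int end, const int *particleArrayIndices,
+                           const glm::vec3 *pos, const glm::vec3 *vel1) {
+  for (int i = start; i <= end; i++) {
+    int boid = particleArrayIndices[i];   // extra index lookup
+    applyRules(pos[boid], vel1[boid]);    // scattered reads
+  }
+}
+
+// Coherent grid: pos/vel were reshuffled into the sorted cell order, so
+// neighbors are read directly and contiguously. Same amount of rule work.
+void coherentNeighborLoop(int start, int end,
+                          const glm::vec3 *coh_pos, const glm::vec3 *coh_vel) {
+  for (int i = start; i <= end; i++) {
+    applyRules(coh_pos[i], coh_vel[i]);   // direct, mostly contiguous reads
+  }
+}
+```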
\ No newline at end of file diff --git a/images/Block-FPS-Without-Visualization.png b/images/Block-FPS-Without-Visualization.png new file mode 100644 index 0000000..3ab6b7f Binary files /dev/null and b/images/Block-FPS-Without-Visualization.png differ diff --git a/images/Boids-FPS-With-Visualization.png b/images/Boids-FPS-With-Visualization.png new file mode 100644 index 0000000..89128ca Binary files /dev/null and b/images/Boids-FPS-With-Visualization.png differ diff --git a/images/Boids-FPS-Without-Visualization.png b/images/Boids-FPS-Without-Visualization.png new file mode 100644 index 0000000..595b065 Binary files /dev/null and b/images/Boids-FPS-Without-Visualization.png differ diff --git a/images/Boids.gif b/images/Boids.gif new file mode 100644 index 0000000..dda244d Binary files /dev/null and b/images/Boids.gif differ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fdd636d..b737097 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,5 +10,5 @@ set(SOURCE_FILES cuda_add_library(src ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_61 ) diff --git a/src/kernel.cu b/src/kernel.cu index 74dffcb..b24470b 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -21,14 +21,14 @@ * Check for CUDA errors; print and exit if there was a problem. */ void checkCUDAError(const char *msg, int line = -1) { - cudaError_t err = cudaGetLastError(); - if (cudaSuccess != err) { - if (line >= 0) { - fprintf(stderr, "Line %d: ", line); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) { + if (line >= 0) { + fprintf(stderr, "Line %d: ", line); + } + fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err)); + exit(EXIT_FAILURE); } - fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } } @@ -85,6 +85,8 @@ int *dev_gridCellEndIndices; // to this cell? // TODO-2.3 - consider what additional buffers you might need to reshuffle // the position and velocity data to be coherent within cells. +glm::vec3 *coh_pos; +glm::vec3 *coh_vel; // LOOK-2.1 - Grid parameters based on simulation parameters. // These are automatically computed for you in Boids::initSimulation @@ -99,13 +101,13 @@ glm::vec3 gridMinimum; ******************/ __host__ __device__ unsigned int hash(unsigned int a) { - a = (a + 0x7ed55d16) + (a << 12); - a = (a ^ 0xc761c23c) ^ (a >> 19); - a = (a + 0x165667b1) + (a << 5); - a = (a + 0xd3a2646c) ^ (a << 9); - a = (a + 0xfd7046c5) + (a << 3); - a = (a ^ 0xb55a4f09) ^ (a >> 16); - return a; + a = (a + 0x7ed55d16) + (a << 12); + a = (a ^ 0xc761c23c) ^ (a >> 19); + a = (a + 0x165667b1) + (a << 5); + a = (a + 0xd3a2646c) ^ (a << 9); + a = (a + 0xfd7046c5) + (a << 3); + a = (a ^ 0xb55a4f09) ^ (a >> 16); + return a; } /** @@ -113,10 +115,10 @@ __host__ __device__ unsigned int hash(unsigned int a) { * Function for generating a random vec3. 
*/ __host__ __device__ glm::vec3 generateRandomVec3(float time, int index) { - thrust::default_random_engine rng(hash((int)(index * time))); - thrust::uniform_real_distribution unitDistrib(-1, 1); + thrust::default_random_engine rng(hash((int)(index * time))); + thrust::uniform_real_distribution unitDistrib(-1, 1); - return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng), (float)unitDistrib(rng)); + return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng), (float)unitDistrib(rng)); } /** @@ -124,52 +126,75 @@ __host__ __device__ glm::vec3 generateRandomVec3(float time, int index) { * CUDA kernel for generating boids with a specified mass randomly around the star. */ __global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, float scale) { - int index = (blockIdx.x * blockDim.x) + threadIdx.x; - if (index < N) { - glm::vec3 rand = generateRandomVec3(time, index); - arr[index].x = scale * rand.x; - arr[index].y = scale * rand.y; - arr[index].z = scale * rand.z; - } + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + glm::vec3 rand = generateRandomVec3(time, index); + arr[index].x = scale * rand.x; + arr[index].y = scale * rand.y; + arr[index].z = scale * rand.z; + } } /** * Initialize memory, update some globals */ void Boids::initSimulation(int N) { - numObjects = N; - dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); - - // LOOK-1.2 - This is basic CUDA memory management and error checking. - // Don't forget to cudaFree in Boids::endSimulation. - cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3)); - checkCUDAErrorWithLine("cudaMalloc dev_pos failed!"); - - cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3)); - checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!"); - - cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3)); - checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!"); - - // LOOK-1.2 - This is a typical CUDA kernel invocation. - kernGenerateRandomPosArray<<>>(1, numObjects, - dev_pos, scene_scale); - checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!"); - - // LOOK-2.1 computing grid params - gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance); - int halfSideCount = (int)(scene_scale / gridCellWidth) + 1; - gridSideCount = 2 * halfSideCount; - - gridCellCount = gridSideCount * gridSideCount * gridSideCount; - gridInverseCellWidth = 1.0f / gridCellWidth; - float halfGridWidth = gridCellWidth * halfSideCount; - gridMinimum.x -= halfGridWidth; - gridMinimum.y -= halfGridWidth; - gridMinimum.z -= halfGridWidth; - - // TODO-2.1 TODO-2.3 - Allocate additional buffers here. - cudaDeviceSynchronize(); + numObjects = N; + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + // LOOK-1.2 - This is basic CUDA memory management and error checking. + // Don't forget to cudaFree in Boids::endSimulation. + cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_pos failed!"); + + cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!"); + + cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!"); + + // LOOK-1.2 - This is a typical CUDA kernel invocation. 
+ kernGenerateRandomPosArray << > > (1, numObjects, + dev_pos, scene_scale); + checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!"); + + // LOOK-2.1 computing grid params + gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance); + int halfSideCount = (int)(scene_scale / gridCellWidth) + 1; + gridSideCount = 2 * halfSideCount; + + gridCellCount = gridSideCount * gridSideCount * gridSideCount; + gridInverseCellWidth = 1.0f / gridCellWidth; + float halfGridWidth = gridCellWidth * halfSideCount; + gridMinimum.x -= halfGridWidth; + gridMinimum.y -= halfGridWidth; + gridMinimum.z -= halfGridWidth; + + // TODO-2.1 TODO-2.3 - Allocate additional buffers here. + + // scattered grid + + cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!"); + + cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!"); + + cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!"); + + cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!"); + + // coherent grid + + cudaMalloc((void**)&coh_pos, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc coh_pos failed!"); + + cudaMalloc((void**)&coh_vel, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc coh_vel failed!"); + + cudaDeviceSynchronize(); } @@ -181,41 +206,41 @@ void Boids::initSimulation(int N) { * Copy the boid positions into the VBO so that they can be drawn by OpenGL. */ __global__ void kernCopyPositionsToVBO(int N, glm::vec3 *pos, float *vbo, float s_scale) { - int index = threadIdx.x + (blockIdx.x * blockDim.x); + int index = threadIdx.x + (blockIdx.x * blockDim.x); - float c_scale = -1.0f / s_scale; + float c_scale = -1.0f / s_scale; - if (index < N) { - vbo[4 * index + 0] = pos[index].x * c_scale; - vbo[4 * index + 1] = pos[index].y * c_scale; - vbo[4 * index + 2] = pos[index].z * c_scale; - vbo[4 * index + 3] = 1.0f; - } + if (index < N) { + vbo[4 * index + 0] = pos[index].x * c_scale; + vbo[4 * index + 1] = pos[index].y * c_scale; + vbo[4 * index + 2] = pos[index].z * c_scale; + vbo[4 * index + 3] = 1.0f; + } } __global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float s_scale) { - int index = threadIdx.x + (blockIdx.x * blockDim.x); - - if (index < N) { - vbo[4 * index + 0] = vel[index].x + 0.3f; - vbo[4 * index + 1] = vel[index].y + 0.3f; - vbo[4 * index + 2] = vel[index].z + 0.3f; - vbo[4 * index + 3] = 1.0f; - } + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index < N) { + vbo[4 * index + 0] = vel[index].x + 0.3f; + vbo[4 * index + 1] = vel[index].y + 0.3f; + vbo[4 * index + 2] = vel[index].z + 0.3f; + vbo[4 * index + 3] = 1.0f; + } } /** * Wrapper for call to the kernCopyboidsToVBO CUDA kernel. 
*/ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) { - dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); - kernCopyPositionsToVBO << > >(numObjects, dev_pos, vbodptr_positions, scene_scale); - kernCopyVelocitiesToVBO << > >(numObjects, dev_vel1, vbodptr_velocities, scene_scale); + kernCopyPositionsToVBO << > > (numObjects, dev_pos, vbodptr_positions, scene_scale); + kernCopyVelocitiesToVBO << > > (numObjects, dev_vel1, vbodptr_velocities, scene_scale); - checkCUDAErrorWithLine("copyBoidsToVBO failed!"); + checkCUDAErrorWithLine("copyBoidsToVBO failed!"); - cudaDeviceSynchronize(); + cudaDeviceSynchronize(); } @@ -230,10 +255,46 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) * in the `pos` and `vel` arrays. */ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) { - // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves - // Rule 2: boids try to stay a distance d away from each other - // Rule 3: boids try to match the speed of surrounding boids - return glm::vec3(0.0f, 0.0f, 0.0f); + + glm::vec3 thisPos = pos[iSelf]; + glm::vec3 center, separation, alignment, cohesion; + unsigned int rule1NumNeighbors = 0, rule3NumNeighbors = 0; + + for (int i = 0; i < N; i++) + { + if (i == iSelf) continue; + float disFromNeighbor = glm::length(pos[i] - thisPos); + + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + if (disFromNeighbor < rule1Distance) { + center += pos[i]; + rule1NumNeighbors++; + } + + // Rule 2: boids try to stay a distance d away from each other + if (disFromNeighbor < rule2Distance) { + separation -= pos[i] - thisPos; + } + + // Rule 3: boids try to match the speed of surrounding boids + if (disFromNeighbor < rule3Distance) { + alignment += vel[i]; + rule3NumNeighbors++; + } + } + + // Average Rule 1 + if (rule1NumNeighbors > 0) { + cohesion = center / ((float)(rule1NumNeighbors)) - thisPos; + } + + // Average Rule 3 + if (rule3NumNeighbors > 0) { + alignment /= rule3NumNeighbors; + } + + return (rule1Scale * cohesion) + (rule2Scale * separation) + (rule3Scale * alignment); + } /** @@ -241,10 +302,23 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po * For each of the `N` bodies, update its position based on its current velocity. */ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, - glm::vec3 *vel1, glm::vec3 *vel2) { - // Compute a new velocity based on pos and vel1 - // Clamp the speed - // Record the new velocity into vel2. Question: why NOT vel1? + glm::vec3 *vel1, glm::vec3 *vel2) { + + // Get the body index + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) return; + + // Compute the new velocity based on pos and vel1 + glm::vec3 newVelocity = vel1[index] + computeVelocityChange(N, index, pos, vel1); + + // Clamp the speed + if (glm::length(newVelocity) > maxSpeed) { + newVelocity = maxSpeed * glm::normalize(newVelocity); + } + + // Record the new velocity into vel2, not vel1 or other bodies will use it + vel2[index] = newVelocity; + } /** @@ -252,24 +326,25 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, * For each of the `N` bodies, update its position based on its current velocity. 
*/ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) { - // Update position by velocity - int index = threadIdx.x + (blockIdx.x * blockDim.x); - if (index >= N) { - return; - } - glm::vec3 thisPos = pos[index]; - thisPos += vel[index] * dt; - // Wrap the boids around so we don't lose them - thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x; - thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y; - thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z; + // Update position by velocity + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + glm::vec3 thisPos = pos[index]; + thisPos += vel[index] * dt; + + // Wrap the boids around so we don't lose them + thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x; + thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y; + thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z; - thisPos.x = thisPos.x > scene_scale ? -scene_scale : thisPos.x; - thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y; - thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z; + thisPos.x = thisPos.x > scene_scale ? -scene_scale : thisPos.x; + thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y; + thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z; - pos[index] = thisPos; + pos[index] = thisPos; } // LOOK-2.1 Consider this method of computing a 1D index from a 3D grid index. @@ -279,179 +354,499 @@ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) { // for(y) // for(z)? Or some other order? __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) { - return x + y * gridResolution + z * gridResolution * gridResolution; + return x + y * gridResolution + z * gridResolution * gridResolution; } __global__ void kernComputeIndices(int N, int gridResolution, - glm::vec3 gridMin, float inverseCellWidth, - glm::vec3 *pos, int *indices, int *gridIndices) { + glm::vec3 gridMin, float inverseCellWidth, + glm::vec3 *pos, int *indices, int *gridIndices) { + // TODO-2.1 // - Label each boid with the index of its grid cell. // - Set up a parallel array of integer indices as pointers to the actual // boid data in pos and vel1/vel2 + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + // grid origin is bottom left, we offset by gridMin so the origin is (0, 0, 0) + // inverseCellWidth we are essentially dividing by the cell width to get grid index + glm::ivec3 gridIndex = glm::floor((pos[index] - gridMin) * inverseCellWidth); + gridIndices[index] = gridIndex3Dto1D(gridIndex.x, gridIndex.y, gridIndex.z, gridResolution); + indices[index] = index; } // LOOK-2.1 Consider how this could be useful for indicating that a cell // does not enclose any boids __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) { - int index = (blockIdx.x * blockDim.x) + threadIdx.x; - if (index < N) { - intBuffer[index] = value; - } + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + intBuffer[index] = value; + } } __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, - int *gridCellStartIndices, int *gridCellEndIndices) { - // TODO-2.1 - // Identify the start point of each cell in the gridIndices array. - // This is basically a parallel unrolling of a loop that goes - // "this index doesn't match the one before it, must be a new cell!" 
+ int *gridCellStartIndices, int *gridCellEndIndices) { + + // TODO-2.1 + // Identify the start point of each cell in the gridIndices array. + // This is basically a parallel unrolling of a loop that goes + // "this index doesn't match the one before it, must be a new cell!" + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + // particleGridIndices takes particle index and gets cell index containing particle + // if particle (index) and particle (index-1) have different cell indices + // then list of consecutive particles for one cell ended, and another began + + int startIndex = particleGridIndices[index - 1]; + int endIndex = particleGridIndices[index]; + + if (index == 0) { + gridCellStartIndices[endIndex] = 0; + } + else if (index == N - 1) { + gridCellEndIndices[endIndex] = N - 1; + } + else if (startIndex != endIndex) { + gridCellEndIndices[startIndex] = index - 1; + gridCellStartIndices[endIndex] = index; + } } __global__ void kernUpdateVelNeighborSearchScattered( - int N, int gridResolution, glm::vec3 gridMin, - float inverseCellWidth, float cellWidth, - int *gridCellStartIndices, int *gridCellEndIndices, - int *particleArrayIndices, - glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce - // the number of boids that need to be checked. - // - Identify the grid cell that this particle is in - // - Identify which cells may contain neighbors. This isn't always 8. - // - For each cell, read the start/end indices in the boid pointer array. - // - Access each boid in the cell and compute velocity change from - // the boids rules, if this boid is within the neighborhood distance. - // - Clamp the speed change before putting the new speed in vel2 + int N, int gridResolution, glm::vec3 gridMin, + float inverseCellWidth, float cellWidth, + int *gridCellStartIndices, int *gridCellEndIndices, + int *particleArrayIndices, + glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { + + // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce + // the number of boids that need to be checked. + // - Identify the grid cell that this particle is in + // - Identify which cells may contain neighbors. This isn't always 8. + // - For each cell, read the start/end indices in the boid pointer array. + // - Access each boid in the cell and compute velocity change from + // the boids rules, if this boid is within the neighborhood distance. 
+ // - Clamp the speed change before putting the new speed in vel2 + + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { + return; + } + + // same as computeVelocityChange + glm::vec3 thisPos = pos[index]; + glm::vec3 center, separation, alignment, cohesion; + unsigned int rule1NumNeighbors = 0, rule3NumNeighbors = 0; + + // we find the min neighbor cell by subtracting the radius from thisPos + // we find the max neighbor cell by adding the radius to thisPos + float maxRadius = glm::max(glm::max(rule1Distance, rule2Distance), rule3Distance); + // grid origin is bottom left, we offset by gridMin so the origin is (0, 0, 0) + // inverseCellWidth we are essentially dividing by the cell width to get cell index + glm::ivec3 startCellIndex = glm::floor((thisPos - glm::vec3(maxRadius) - gridMin) * inverseCellWidth); + glm::ivec3 endCellIndex = glm::floor((thisPos + glm::vec3(maxRadius) - gridMin) * inverseCellWidth); + // clamp the indices if the cell is on the edge of the grid + startCellIndex = glm::clamp(startCellIndex, glm::ivec3(0), glm::ivec3(gridResolution)); + endCellIndex = glm::clamp(endCellIndex, glm::ivec3(0), glm::ivec3(gridResolution)); + + for (int z = startCellIndex.z; z <= endCellIndex.z; z++) + { + for (int y = startCellIndex.y; y <= endCellIndex.y; y++) + { + for (int x = startCellIndex.x; x <= endCellIndex.x; x++) + { + int neighborCellIndex = gridIndex3Dto1D(x, y, z, gridResolution); + + // iterate through the list of consecutive particles for the neighboring cell + int start = gridCellStartIndices[neighborCellIndex]; + int end = gridCellEndIndices[neighborCellIndex]; + + for (int neighborParticleIndex = start; neighborParticleIndex <= end; neighborParticleIndex++) + { + // safety check just in case index is out of bounds + if (neighborParticleIndex < 0 || neighborParticleIndex >= N) continue; + // do not count the current particle we are looking at + if (neighborParticleIndex == index) continue; + + int neighborParticleInfoIndex = particleArrayIndices[neighborParticleIndex]; + float disFromNeighbor = glm::length(pos[neighborParticleInfoIndex] - thisPos); + + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + if (disFromNeighbor < rule1Distance) { + center += pos[neighborParticleInfoIndex]; + rule1NumNeighbors++; + } + + // Rule 2: boids try to stay a distance d away from each other + if (disFromNeighbor < rule2Distance) { + separation -= pos[neighborParticleInfoIndex] - thisPos; + } + + // Rule 3: boids try to match the speed of surrounding boids + if (disFromNeighbor < rule3Distance) { + alignment += vel1[neighborParticleInfoIndex]; + rule3NumNeighbors++; + } + + } + } + } + } + + // same as computeVelocityChange + if (rule1NumNeighbors > 0) { + cohesion = center / ((float)(rule1NumNeighbors)) - thisPos; + } + if (rule3NumNeighbors > 0) { + alignment /= rule3NumNeighbors; + } + glm::vec3 thisVel = vel1[index] + (rule1Scale * cohesion) + (rule2Scale * separation) + (rule3Scale * alignment); + + //clamp velocity change + if (glm::length(thisVel) > maxSpeed) { + thisVel = glm::normalize(thisVel) * maxSpeed; + } + + vel2[index] = thisVel; + } __global__ void kernUpdateVelNeighborSearchCoherent( - int N, int gridResolution, glm::vec3 gridMin, - float inverseCellWidth, float cellWidth, - int *gridCellStartIndices, int *gridCellEndIndices, - glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered, - // except with one less level of 
indirection. - // This should expect gridCellStartIndices and gridCellEndIndices to refer - // directly to pos and vel1. - // - Identify the grid cell that this particle is in - // - Identify which cells may contain neighbors. This isn't always 8. - // - For each cell, read the start/end indices in the boid pointer array. - // DIFFERENCE: For best results, consider what order the cells should be - // checked in to maximize the memory benefits of reordering the boids data. - // - Access each boid in the cell and compute velocity change from - // the boids rules, if this boid is within the neighborhood distance. - // - Clamp the speed change before putting the new speed in vel2 + int N, int gridResolution, glm::vec3 gridMin, + float inverseCellWidth, float cellWidth, + int *gridCellStartIndices, int *gridCellEndIndices, + glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) +{ + // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered, + // except with one less level of indirection. + // This should expect gridCellStartIndices and gridCellEndIndices to refer + // directly to pos and vel1. + // - Identify the grid cell that this particle is in + // - Identify which cells may contain neighbors. This isn't always 8. + // - For each cell, read the start/end indices in the boid pointer array. + // DIFFERENCE: For best results, consider what order the cells should be + // checked in to maximize the memory benefits of reordering the boids data. + // - Access each boid in the cell and compute velocity change from + // the boids rules, if this boid is within the neighborhood distance. + // - Clamp the speed change before putting the new speed in vel2 + + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { + return; + } + + // same as computeVelocityChange + glm::vec3 thisPos = pos[index]; + glm::vec3 center, separation, alignment, cohesion; + unsigned int rule1NumNeighbors = 0, rule3NumNeighbors = 0; + + // we find the min neighbor cell by subtracting the radius from thisPos + // we find the max neighbor cell by adding the radius to thisPos + float maxRadius = glm::max(glm::max(rule1Distance, rule2Distance), rule3Distance); + // grid origin is bottom left, we offset by gridMin so the origin is (0, 0, 0) + // inverseCellWidth we are essentially dividing by the cell width to get cell index + glm::ivec3 startCellIndex = glm::floor((thisPos - glm::vec3(maxRadius) - gridMin) * inverseCellWidth); + glm::ivec3 endCellIndex = glm::floor((thisPos + glm::vec3(maxRadius) - gridMin) * inverseCellWidth); + // clamp the indices if the cell is on the edge of the grid + startCellIndex = glm::clamp(startCellIndex, glm::ivec3(0), glm::ivec3(gridResolution)); + endCellIndex = glm::clamp(endCellIndex, glm::ivec3(0), glm::ivec3(gridResolution)); + + for (int z = startCellIndex.z; z <= endCellIndex.z; z++) + { + for (int y = startCellIndex.y; y <= endCellIndex.y; y++) + { + for (int x = startCellIndex.x; x <= endCellIndex.x; x++) + { + int neighborCellIndex = gridIndex3Dto1D(x, y, z, gridResolution); + + // iterate through the list of consecutive particles for the neighboring cell + int start = gridCellStartIndices[neighborCellIndex]; + int end = gridCellEndIndices[neighborCellIndex]; + + for (int neighborParticleIndex = start; neighborParticleIndex <= end; neighborParticleIndex++) + { + // safety check just in case index is out of bounds + if (neighborParticleIndex < 0 || neighborParticleIndex >= N) continue; + // do not count the current particle we are looking at + if 
(neighborParticleIndex == index) continue; + + // NOTE IN COHERENT, we use neighborParticleIndex directly + float disFromNeighbor = glm::length(pos[neighborParticleIndex] - thisPos); + + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + if (disFromNeighbor < rule1Distance) { + center += pos[neighborParticleIndex]; + rule1NumNeighbors++; + } + + // Rule 2: boids try to stay a distance d away from each other + if (disFromNeighbor < rule2Distance) { + separation -= pos[neighborParticleIndex] - thisPos; + } + + // Rule 3: boids try to match the speed of surrounding boids + if (disFromNeighbor < rule3Distance) { + alignment += vel1[neighborParticleIndex]; + rule3NumNeighbors++; + } + + } + } + } + } + + // same as computeVelocityChange + if (rule1NumNeighbors > 0) { + cohesion = center / ((float)(rule1NumNeighbors)) - thisPos; + } + if (rule3NumNeighbors > 0) { + alignment /= rule3NumNeighbors; + } + glm::vec3 thisVel = vel1[index] + (rule1Scale * cohesion) + (rule2Scale * separation) + (rule3Scale * alignment); + + //clamp velocity change + if (glm::length(thisVel) > maxSpeed) { + thisVel = glm::normalize(thisVel) * maxSpeed; + } + + vel2[index] = thisVel; } /** * Step the entire N-body simulation by `dt` seconds. */ -void Boids::stepSimulationNaive(float dt) { - // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. - // TODO-1.2 ping-pong the velocity buffers +void Boids::stepSimulationNaive(float dt) +{ + // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + kernUpdateVelocityBruteForce<<>>(numObjects, dev_pos, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!"); + + kernUpdatePos<<>>(numObjects, dt, dev_pos, dev_vel2); + checkCUDAErrorWithLine("kernUpdatePos failed!"); + + // TODO-1.2 ping-pong the velocity buffers + std::swap(dev_vel1, dev_vel2); } -void Boids::stepSimulationScatteredGrid(float dt) { - // TODO-2.1 - // Uniform Grid Neighbor search using Thrust sort. - // In Parallel: - // - label each particle with its array index as well as its grid index. - // Use 2x width grids. - // - Unstable key sort using Thrust. A stable sort isn't necessary, but you - // are welcome to do a performance comparison. - // - Naively unroll the loop for finding the start and end indices of each - // cell's data pointers in the array of boid indices - // - Perform velocity updates using neighbor search - // - Update positions - // - Ping-pong buffers as needed +void Boids::stepSimulationScatteredGrid(float dt) +{ + // TODO-2.1 + // Uniform Grid Neighbor search using Thrust sort. + // In Parallel: + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + // - label each particle with its array index as well as its grid index. + // Use 2x width grids. + kernComputeIndices<<>>(numObjects, + gridSideCount, + gridMinimum, + gridInverseCellWidth, + dev_pos, + dev_particleArrayIndices, + dev_particleGridIndices); + dev_thrust_particleArrayIndices = thrust::device_ptr(dev_particleArrayIndices); + dev_thrust_particleGridIndices = thrust::device_ptr(dev_particleGridIndices); + + // - Unstable key sort using Thrust. A stable sort isn't necessary, but you + // are welcome to do a performance comparison. 
+ thrust::sort_by_key(dev_thrust_particleGridIndices, + dev_thrust_particleGridIndices + numObjects, + dev_thrust_particleArrayIndices); + + // - Naively unroll the loop for finding the start and end indices of each + // cell's data pointers in the array of boid indices + kernResetIntBuffer<<>>(numObjects, dev_gridCellStartIndices, -1); + kernResetIntBuffer<<>>(numObjects, dev_gridCellEndIndices, -1); + kernIdentifyCellStartEnd<<>>(numObjects, + dev_particleGridIndices, + dev_gridCellStartIndices, + dev_gridCellEndIndices); + + // - Perform velocity updates using neighbor search + kernUpdateVelNeighborSearchScattered<<>>(numObjects, + gridSideCount, + gridMinimum, + gridInverseCellWidth, + gridCellWidth, + dev_gridCellStartIndices, + dev_gridCellEndIndices, + dev_particleArrayIndices, + dev_pos, + dev_vel1, + dev_vel2); + + // - Update positions + kernUpdatePos<<>>(numObjects, dt, dev_pos, dev_vel2); + + // - Ping-pong buffers as needed + std::swap(dev_vel1, dev_vel2); + } -void Boids::stepSimulationCoherentGrid(float dt) { - // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid - // Uniform Grid Neighbor search using Thrust sort on cell-coherent data. - // In Parallel: - // - Label each particle with its array index as well as its grid index. - // Use 2x width grids - // - Unstable key sort using Thrust. A stable sort isn't necessary, but you - // are welcome to do a performance comparison. - // - Naively unroll the loop for finding the start and end indices of each - // cell's data pointers in the array of boid indices - // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all - // the particle data in the simulation array. - // CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED - // - Perform velocity updates using neighbor search - // - Update positions - // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. +__global__ void kernShufflePosVel(int N, glm::vec3 *pos, glm::vec3 *vel1, + glm::vec3 *coh_pos, glm::vec3 *coh_vel, int *particleArrayIndices) +{ + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { + return; + } + + int index2 = particleArrayIndices[index]; + coh_pos[index] = pos[index2]; + coh_vel[index] = vel1[index2]; } -void Boids::endSimulation() { - cudaFree(dev_vel1); - cudaFree(dev_vel2); - cudaFree(dev_pos); +void Boids::stepSimulationCoherentGrid(float dt) +{ + // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid + // Uniform Grid Neighbor search using Thrust sort on cell-coherent data. + // In Parallel: + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + // - Label each particle with its array index as well as its grid index. + kernResetIntBuffer<<>>(numObjects, dev_gridCellStartIndices, -1); + kernResetIntBuffer<<>>(numObjects, dev_gridCellEndIndices, -1); + kernComputeIndices<<>>(numObjects, + gridSideCount, + gridMinimum, + gridInverseCellWidth, + dev_pos, + dev_particleArrayIndices, + dev_particleGridIndices); + + // Use 2x width grids + // - Unstable key sort using Thrust. A stable sort isn't necessary, but you + // are welcome to do a performance comparison. 
+ dev_thrust_particleArrayIndices = thrust::device_ptr(dev_particleArrayIndices); + dev_thrust_particleGridIndices = thrust::device_ptr(dev_particleGridIndices); + + thrust::sort_by_key(dev_thrust_particleGridIndices, + dev_thrust_particleGridIndices + numObjects, + dev_thrust_particleArrayIndices); + + // - Naively unroll the loop for finding the start and end indices of each + // cell's data pointers in the array of boid indices + kernIdentifyCellStartEnd<<>>(numObjects, + dev_particleGridIndices, + dev_gridCellStartIndices, + dev_gridCellEndIndices); + + // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all + // the particle data in the simulation array. + // CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED + kernShufflePosVel<<>>(numObjects, + dev_pos, + dev_vel1, + coh_pos, + coh_vel, + dev_particleArrayIndices); + + // - Perform velocity updates using neighbor search + kernUpdateVelNeighborSearchCoherent<<>>(numObjects, + gridSideCount, + gridMinimum, + gridInverseCellWidth, + gridCellWidth, + dev_gridCellStartIndices, + dev_gridCellEndIndices, + coh_pos, + coh_vel, + dev_vel2); + + // - Update positions + kernUpdatePos<<>>(numObjects, dt, coh_pos, dev_vel2); + checkCUDAErrorWithLine("kernal kernUpdatePos failed!"); + + // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. + std::swap(dev_pos, coh_pos); + std::swap(dev_vel1, dev_vel2); - // TODO-2.1 TODO-2.3 - Free any additional buffers here. +} + +void Boids::endSimulation() { + cudaFree(dev_vel1); + cudaFree(dev_vel2); + cudaFree(dev_pos); + + // TODO-2.1 TODO-2.3 - Free any additional buffers here. + cudaFree(dev_particleArrayIndices); + cudaFree(dev_particleGridIndices); + cudaFree(dev_gridCellStartIndices); + cudaFree(dev_gridCellEndIndices); + + cudaFree(coh_pos); + cudaFree(coh_vel); } void Boids::unitTest() { - // LOOK-1.2 Feel free to write additional tests here. - - // test unstable sort - int *dev_intKeys; - int *dev_intValues; - int N = 10; - - std::unique_ptrintKeys{ new int[N] }; - std::unique_ptrintValues{ new int[N] }; - - intKeys[0] = 0; intValues[0] = 0; - intKeys[1] = 1; intValues[1] = 1; - intKeys[2] = 0; intValues[2] = 2; - intKeys[3] = 3; intValues[3] = 3; - intKeys[4] = 0; intValues[4] = 4; - intKeys[5] = 2; intValues[5] = 5; - intKeys[6] = 2; intValues[6] = 6; - intKeys[7] = 0; intValues[7] = 7; - intKeys[8] = 5; intValues[8] = 8; - intKeys[9] = 6; intValues[9] = 9; - - cudaMalloc((void**)&dev_intKeys, N * sizeof(int)); - checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!"); - - cudaMalloc((void**)&dev_intValues, N * sizeof(int)); - checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!"); - - dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); - - std::cout << "before unstable sort: " << std::endl; - for (int i = 0; i < N; i++) { - std::cout << " key: " << intKeys[i]; - std::cout << " value: " << intValues[i] << std::endl; - } - - // How to copy data to the GPU - cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N, cudaMemcpyHostToDevice); - cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N, cudaMemcpyHostToDevice); - - // Wrap device vectors in thrust iterators for use with thrust. 
- thrust::device_ptr dev_thrust_keys(dev_intKeys); - thrust::device_ptr dev_thrust_values(dev_intValues); - // LOOK-2.1 Example for using thrust::sort_by_key - thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values); - - // How to copy data back to the CPU side from the GPU - cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost); - cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost); - checkCUDAErrorWithLine("memcpy back failed!"); - - std::cout << "after unstable sort: " << std::endl; - for (int i = 0; i < N; i++) { - std::cout << " key: " << intKeys[i]; - std::cout << " value: " << intValues[i] << std::endl; - } - - // cleanup - cudaFree(dev_intKeys); - cudaFree(dev_intValues); - checkCUDAErrorWithLine("cudaFree failed!"); - return; + // LOOK-1.2 Feel free to write additional tests here. + + // test unstable sort + int *dev_intKeys; + int *dev_intValues; + int N = 10; + + std::unique_ptrintKeys{ new int[N] }; + std::unique_ptrintValues{ new int[N] }; + + intKeys[0] = 0; intValues[0] = 0; + intKeys[1] = 1; intValues[1] = 1; + intKeys[2] = 0; intValues[2] = 2; + intKeys[3] = 3; intValues[3] = 3; + intKeys[4] = 0; intValues[4] = 4; + intKeys[5] = 2; intValues[5] = 5; + intKeys[6] = 2; intValues[6] = 6; + intKeys[7] = 0; intValues[7] = 7; + intKeys[8] = 5; intValues[8] = 8; + intKeys[9] = 6; intValues[9] = 9; + + cudaMalloc((void**)&dev_intKeys, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!"); + + cudaMalloc((void**)&dev_intValues, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!"); + + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + std::cout << "before unstable sort: " << std::endl; + for (int i = 0; i < N; i++) { + std::cout << " key: " << intKeys[i]; + std::cout << " value: " << intValues[i] << std::endl; + } + + // How to copy data to the GPU + cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N, cudaMemcpyHostToDevice); + cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N, cudaMemcpyHostToDevice); + + // Wrap device vectors in thrust iterators for use with thrust. + thrust::device_ptr dev_thrust_keys(dev_intKeys); + thrust::device_ptr dev_thrust_values(dev_intValues); + // LOOK-2.1 Example for using thrust::sort_by_key + thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values); + + // How to copy data back to the CPU side from the GPU + cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost); + cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost); + checkCUDAErrorWithLine("memcpy back failed!"); + + std::cout << "after unstable sort: " << std::endl; + for (int i = 0; i < N; i++) { + std::cout << " key: " << intKeys[i]; + std::cout << " value: " << intValues[i] << std::endl; + } + + // cleanup + cudaFree(dev_intKeys); + cudaFree(dev_intValues); + checkCUDAErrorWithLine("cudaFree failed!"); + return; }