Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Flexible GPU and CPU Usage for Gravity #170

Merged
merged 23 commits into from
Jun 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
3806d32
TreePieces and ListCompute can execute code for CPU tree walk when CU…
spencerw May 9, 2024
a2116c2
Move repeated code into separate functions
spencerw May 10, 2024
5250226
Minimal working version, gravity is done on CPU when CUDA flag enabled
spencerw May 10, 2024
1b56b0b
Main signals whether gravity happens on GPU based on rung population
spencerw May 10, 2024
4f6ec8d
Missing ifdef around DataManager::unmarkTreePiecesForCleanup
spencerw May 13, 2024
b803652
Fix issue with hang during calculateEwald on CPU step
spencerw May 21, 2024
81c97f9
Fix indentation in ListCompute::stateReady
spencerw May 21, 2024
3876e9e
Remove TODO comments near useckloop checks
spencerw May 21, 2024
8207f13
Remove initAccel() call before first big step
spencerw May 21, 2024
a995757
Remove comments about resetting bUseGpu
spencerw May 21, 2024
32584d8
Dont set lpdata in TreePiece::nextBucket if using CUDA
spencerw May 21, 2024
8f6ffe0
Misspelled ewaldCPU
spencerw May 23, 2024
5f7275e
Replace bUseGpu with bUseCpu
spencerw Jun 5, 2024
2d7a5d9
Fixes to comments
spencerw Jun 11, 2024
e4a6d22
Restructure ifdef's to get rid of schedCpuWalk
spencerw Jun 11, 2024
d62a1dc
Rearrange arguments in TreePiece::startGravity
spencerw Jun 11, 2024
3f357cb
Misc fixes
spencerw Jun 11, 2024
4cb476f
Remove unnecessary ifdef's
spencerw Jun 11, 2024
8a049e8
Forgot to switch startGravity parameters in .ci file
spencerw Jun 11, 2024
61280bd
Remove extraneous enableCpu call
spencerw Jun 11, 2024
71d092d
Fix comment
spencerw Jun 11, 2024
a5378ca
Incorporate changes from spencerw:multiple_gpu
spencerw Jun 11, 2024
428b14a
Fix startGravity callback for non-concurrent SPH
spencerw Jun 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
682 changes: 353 additions & 329 deletions Compute.cpp

Large diffs are not rendered by default.

8 changes: 7 additions & 1 deletion Compute.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,9 @@ class GravityCompute : public Compute{
class ListCompute : public Compute{

public:
ListCompute() : Compute(List) {}
/// Construct a List-type compute; the CPU override flag starts cleared.
ListCompute() : Compute(List), bUseCpu(0) {}

int doWork(GenericTreeNode *, TreeWalk *tw, State *state, int chunk, int reqID, bool isRoot, bool &didcomp, int awi);

Expand Down Expand Up @@ -179,6 +181,8 @@ class ListCompute : public Compute{
State *getNewState();
void freeState(State *state);
void freeDoubleWalkState(DoubleWalkState *state);
/// @brief Select the cpu (instead of the gpu) for the next walk by
/// setting bUseCpu to 1.
void enableCpu() {bUseCpu = 1;}
spencerw marked this conversation as resolved.
Show resolved Hide resolved

#ifdef CUDA
#ifdef GPU_LOCAL_TREE_WALK
Expand All @@ -194,6 +198,8 @@ class ListCompute : public Compute{
void addNodeToInt(GenericTreeNode *node, int offsetID, DoubleWalkState *s);

DoubleWalkState *allocDoubleWalkState();
/// used to flag cpu (instead of gpu) for usage when compiling with CUDA
int bUseCpu;

#if defined CHANGA_REFACTOR_PRINT_INTERACTIONS || defined CHANGA_REFACTOR_WALKCHECK_INTERLIST || defined CUDA
void addRemoteParticlesToInt(ExternalGravityParticle *parts, int n,
Expand Down
19 changes: 16 additions & 3 deletions DataManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ void DataManager::init() {
root = NULL;
oldNumChunks = 0;
chunkRoots = NULL;
cleanupTreePieces = true;
#ifdef CUDA
treePiecesDone = 0;
treePiecesDonePrefetch = 0;
Expand Down Expand Up @@ -161,9 +162,20 @@ void DataManager::notifyPresence(Tree::GenericTreeNode *root, TreePiece *tp) {
/// \brief Empty this node's list of registered TreePieces.
///
/// Also sets cleanupTreePieces back to true, so that combineLocalTrees()
/// clears the list itself again.
/// \param cb callback handed to contribute() once the list is emptied.
void DataManager::clearRegisteredPieces(const CkCallback& cb)
{
    // The flag and the list are independent of each other; restore the
    // flag first, then drop the registered pieces.
    cleanupTreePieces = true;
    registeredTreePieces.removeAll();
    contribute(cb);
}

#ifdef CUDA
/// \brief Keep registeredTreePieces across the next combineLocalTrees().
///
/// This gets called before a tree build happens when we are about to do
/// a gravity calculation on the GPU. Setting cleanupTreePieces to false
/// makes combineLocalTrees() skip its registeredTreePieces.removeAll().
/// \param cb callback passed to contribute() after the flag is cleared.
void DataManager::unmarkTreePiecesForCleanup(const CkCallback& cb) {
cleanupTreePieces = false;
contribute(cb);
}
#endif


/// \brief Build a local tree inside the node.
///
Expand Down Expand Up @@ -211,9 +223,9 @@ void DataManager::combineLocalTrees(CkReductionMsg *msg) {
}
root = buildProcessorTree(totalChares, &gtn[0]);

#ifndef CUDA
registeredTreePieces.removeAll();
#endif
if (cleanupTreePieces) {
registeredTreePieces.removeAll();
}

#ifdef PRINT_MERGED_TREE
ostringstream dmName;
Expand Down Expand Up @@ -996,6 +1008,7 @@ void DataManager::transferParticleVarsBack(){
cudaFree(d_localVars);
cudaFree(d_remoteMoments);
cudaFree(d_remoteParts);
cleanupTreePieces = true;

#ifdef CUDA_PRINT_ERRORS
printf("transferParticleVarsBack: %s\n", cudaGetErrorString( cudaGetLastError() ) );
Expand Down
4 changes: 4 additions & 0 deletions DataManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ class DataManager : public CBase_DataManager {
/// A list of roots of the TreePieces in this node
// holds chare array indices of registered treepieces
CkVec<TreePieceDescriptor> registeredTreePieces;
/// Signal whether registeredTreePieces needs to be cleaned
/// when combining local trees
bool cleanupTreePieces;
#ifdef CUDA
//CkVec<int> registeredTreePieceIndices;
/// @brief counter for the number of tree nodes that are
Expand Down Expand Up @@ -252,6 +255,7 @@ class DataManager : public CBase_DataManager {
std::map<NodeKey, int> &getCachedPartsOnGpuTable(){
return cachedPartsOnGpu;
}
void unmarkTreePiecesForCleanup(const CkCallback& cb);
#endif
// Functions used to create a tree inside the DataManager comprising
// all the trees in the TreePieces in the local node
Expand Down
4 changes: 3 additions & 1 deletion ParallelGravity.ci
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ mainmodule ParallelGravity {
entry void startLocalWalk();
entry void resumeRemoteChunk();
entry void createStreams(int _numStreams, const CkCallback& cb);
entry void unmarkTreePiecesForCleanup(const CkCallback& cb);
#endif
entry void initCooling(double dGmPerCcUnit, double dComovingGmPerCcUnit,
double dErgPerGmUnit, double dSecUnit, double dKpcUnit,
Expand Down Expand Up @@ -459,7 +460,8 @@ mainmodule ParallelGravity {

entry void startORBTreeBuild(CkReductionMsg* m);

entry void startGravity(int activeRung, double myTheta, const CkCallback &cb);
entry void startGravity(int activeRung, int bUseCpu_, double myTheta, const CkCallback &cb);

#ifdef PUSH_GRAVITY
entry void startPushGravity(int am, double myTheta);
entry void recvPushBuckets(BucketMsg *);
Expand Down
49 changes: 35 additions & 14 deletions ParallelGravity.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -757,6 +757,9 @@ Main::Main(CkArgMsg* m) {
numStreams = 100;
prmAddParam(prm, "nStreams", paramInt, &numStreams,
sizeof(int),"str", "Number of CUDA streams (default: 100)");
param.nGpuMinParts = 1000;
prmAddParam(prm, "nGpuMinParts", paramInt, &param.nGpuMinParts,
sizeof(int),"gpup", "Min particles on rung to trigger GPU (default: 1000)");
#endif
particlesPerChare = 0;
prmAddParam(prm, "nPartPerChare", paramInt, &particlesPerChare,
Expand Down Expand Up @@ -1713,6 +1716,13 @@ Main::loadBalance(int iPhase)
/// @param iPhase Active rung (or phase).
void Main::buildTree(int iPhase)
{
#ifdef CUDA
// If we are about to use the GPU, tell the data manager
// not to clean up its TreePiece list during combineLocalTrees
if (nActiveGrav >= param.nGpuMinParts) {
dMProxy.unmarkTreePiecesForCleanup(CkCallbackResumeThread());
}
#endif
#ifdef PUSH_GRAVITY
bool bDoPush = param.dFracPushParticles*nTotalParticles > nActiveGrav;
if(bDoPush) CkPrintf("[main] fracActive %f PUSH_GRAVITY\n", 1.0*nActiveGrav/nTotalParticles);
Expand Down Expand Up @@ -1752,6 +1762,9 @@ void Main::startGravity(const CkCallback& cbGravity, int iActiveRung,
turnProjectionsOn(iActiveRung);
#endif

#ifdef CUDA
spencerw marked this conversation as resolved.
Show resolved Hide resolved
// Use >= so this message agrees with the device selection made below:
// the CPU is chosen only when nActiveGrav < param.nGpuMinParts, so the
// GPU is also used when the two are equal.
if (nActiveGrav >= param.nGpuMinParts) CkPrintf("Gravity will be calculated on the GPU\n");
#endif
CkPrintf("Calculating gravity (tree bucket, theta = %f) ... ", theta);
*startTime = CkWallTimer();
if(param.bConcurrentSph) {
Expand All @@ -1761,7 +1774,12 @@ void Main::startGravity(const CkCallback& cbGravity, int iActiveRung,
}
else{
#endif
treeProxy.startGravity(iActiveRung, theta, cbGravity);
int bUseCpu = 1;
#ifdef CUDA
bUseCpu = nActiveGrav < param.nGpuMinParts;
#endif
treeProxy.startGravity(iActiveRung, bUseCpu, theta, cbGravity);

#ifdef PUSH_GRAVITY
}
#endif
Expand All @@ -1774,7 +1792,13 @@ void Main::startGravity(const CkCallback& cbGravity, int iActiveRung,
}
else{
#endif
treeProxy.startGravity(iActiveRung, theta, CkCallbackResumeThread());

int bUseCpu = 1;
#ifdef CUDA
bUseCpu = nActiveGrav < param.nGpuMinParts;
#endif
treeProxy.startGravity(iActiveRung, bUseCpu, theta, CkCallbackResumeThread());

#ifdef PUSH_GRAVITY
}
#endif
Expand All @@ -1795,7 +1819,9 @@ void Main::startGravity(const CkCallback& cbGravity, int iActiveRung,
#ifdef CUDA
// We didn't do gravity where the registered TreePieces on the
// DataManager normally get cleared. Clear them here instead.
dMProxy.clearRegisteredPieces(CkCallbackResumeThread());
// buildTree() unmarks the TreePieces for cleanup whenever
// nActiveGrav >= param.nGpuMinParts, so the same inclusive threshold
// must be used here. With '>' the list would be left uncleared (and
// cleanupTreePieces left false) when nActiveGrav == param.nGpuMinParts.
if (nActiveGrav >= param.nGpuMinParts) {
dMProxy.clearRegisteredPieces(CkCallbackResumeThread());
}
#endif
}
}
Expand Down Expand Up @@ -2718,15 +2744,6 @@ Main::initialForces()
if(verbosity)
memoryStats();

#ifdef CUDA
ckout << "Init. Accel. ...";
double dInitAccelTime = CkWallTimer();
treeProxy.initAccel(0, CkCallbackResumeThread());
ckout << " took " << (CkWallTimer() - dInitAccelTime) << " seconds."
<< endl;
#endif


CkCallback cbGravity(CkCallback::resumeThread); // needed below to wait for gravity

double gravStartTime;
Expand Down Expand Up @@ -3649,7 +3666,9 @@ void Main::writeOutput(int iStep)
#ifdef CUDA
// We didn't do gravity where the registered TreePieces on the
// DataManager normally get cleared. Clear them here instead.
dMProxy.clearRegisteredPieces(CkCallbackResumeThread());
// buildTree() unmarks the TreePieces for cleanup whenever
// nActiveGrav >= param.nGpuMinParts, so the same inclusive threshold
// must be used here. With '>' the list would be left uncleared (and
// cleanupTreePieces left false) when nActiveGrav == param.nGpuMinParts.
if (nActiveGrav >= param.nGpuMinParts) {
dMProxy.clearRegisteredPieces(CkCallbackResumeThread());
}
#endif
if(verbosity) {
ckout << " took " << (CkWallTimer() - startTime) << " seconds."
Expand Down Expand Up @@ -3690,7 +3709,9 @@ void Main::writeOutput(int iStep)
#ifdef CUDA
// We didn't do gravity where the registered TreePieces on the
// DataManager normally get cleared. Clear them here instead.
dMProxy.clearRegisteredPieces(CkCallbackResumeThread());
// buildTree() unmarks the TreePieces for cleanup whenever
// nActiveGrav >= param.nGpuMinParts, so the same inclusive threshold
// must be used here. With '>' the list would be left uncleared (and
// cleanupTreePieces left false) when nActiveGrav == param.nGpuMinParts.
if (nActiveGrav >= param.nGpuMinParts) {
dMProxy.clearRegisteredPieces(CkCallbackResumeThread());
}
#endif
if(verbosity)
ckout << " took " << (CkWallTimer() - startTime) << " seconds."
Expand Down
8 changes: 7 additions & 1 deletion ParallelGravity.h
Original file line number Diff line number Diff line change
Expand Up @@ -1200,6 +1200,9 @@ class TreePiece : public CBase_TreePiece {
/// The current active mask for force computation in multistepping
int activeRung;

/// Nonzero when the CPU (rather than the GPU) is to be used on the
/// current gravity substep.
int bUseCpu;

/// Periodic Boundary stuff
int bPeriodic;
int bComove;
Expand Down Expand Up @@ -1457,6 +1460,7 @@ class TreePiece : public CBase_TreePiece {
#if INTERLIST_VER > 0
sInterListWalk = NULL;
#endif
bUseCpu = 1;
#ifdef CUDA
numActiveBuckets = -1;
#ifdef HAPI_TRACE
Expand Down Expand Up @@ -1583,6 +1587,7 @@ class TreePiece : public CBase_TreePiece {
int bComove, double dRhoFac);
void BucketEwald(GenericTreeNode *req, int nReps,double fEwCut);
void EwaldInit();
void ewaldCPU(EwaldMsg *msg);
void calculateEwald(EwaldMsg *m);
void calculateEwaldUsingCkLoop(int yield_num);
void callBucketEwald(int id);
Expand Down Expand Up @@ -1898,8 +1903,9 @@ class TreePiece : public CBase_TreePiece {
/// @brief Start a tree based gravity computation.
/// @param am the active rung for the computation
/// @param bUseCpu_ nonzero to use the cpu instead of the gpu
/// @param myTheta the opening angle
/// @param cb the callback to use after all the computation has finished
void startGravity(int am, double myTheta, const CkCallback& cb);
void startGravity(int am, int bUseCpu_, double myTheta, const CkCallback& cb);
/// Setup utility function for all the smooths. Initializes caches.
void setupSmooth();
/// Start a tree based smooth computation.
Expand Down
Loading
Loading