Skip to content

Commit

Permalink
[AMDGPU] Merge SIRemoveShortExecBranches into SIPreEmitPeephole
Browse files Browse the repository at this point in the history
SIRemoveShortExecBranches is an optimisation so fits well in the
context of SIPreEmitPeephole.

Test changes relate to early termination from kills which have now
been lowered prior to considering branches for removal.
As these use s_cbranch the execz skips are now retained instead.
Currently either behaviour is valid as kill with EXEC=0 is a nop;
however, if early termination is used differently in future then
the new behaviour is the correct one.

Reviewed By: foad

Differential Revision: https://reviews.llvm.org/D98917
  • Loading branch information
perlfu committed Mar 20, 2021
1 parent 8bc3685 commit 5df2af8
Show file tree
Hide file tree
Showing 16 changed files with 211 additions and 234 deletions.
3 changes: 0 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -204,9 +204,6 @@ extern char &SIWholeQuadModeID;
void initializeSILowerControlFlowPass(PassRegistry &);
extern char &SILowerControlFlowID;

void initializeSIRemoveShortExecBranchesPass(PassRegistry &);
extern char &SIRemoveShortExecBranchesID;

void initializeSIPreEmitPeepholePass(PassRegistry &);
extern char &SIPreEmitPeepholeID;

Expand Down
2 changes: 0 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSIModeRegisterPass(*PR);
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
initializeSIRemoveShortExecBranchesPass(*PR);
initializeSIPreEmitPeepholePass(*PR);
initializeSIInsertSkipsPass(*PR);
initializeSIMemoryLegalizerPass(*PR);
Expand Down Expand Up @@ -1215,7 +1214,6 @@ void GCNPassConfig::addPreEmitPass() {
if (getOptLevel() > CodeGenOpt::None)
addPass(&SIInsertHardClausesID);

addPass(&SIRemoveShortExecBranchesID);
addPass(&SIInsertSkipsPassID);
addPass(&SIPreEmitPeepholeID);
// The hazard recognizer that runs as part of the post-ra scheduler does not
Expand Down
1 change: 0 additions & 1 deletion llvm/lib/Target/AMDGPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,6 @@ add_llvm_target(AMDGPUCodeGen
SIPreEmitPeephole.cpp
SIProgramInfo.cpp
SIRegisterInfo.cpp
SIRemoveShortExecBranches.cpp
SIShrinkInstructions.cpp
SIWholeQuadMode.cpp
GCNILPSched.cpp
Expand Down
90 changes: 89 additions & 1 deletion llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,14 @@ using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"

static unsigned SkipThreshold;

static cl::opt<unsigned, true> SkipThresholdFlag(
"amdgpu-skip-threshold", cl::Hidden,
cl::desc(
"Number of instructions before jumping over divergent control flow"),
cl::location(SkipThreshold), cl::init(12));

namespace {

class SIPreEmitPeephole : public MachineFunctionPass {
Expand All @@ -30,6 +38,13 @@ class SIPreEmitPeephole : public MachineFunctionPass {

bool optimizeVccBranch(MachineInstr &MI) const;
bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
bool getBlockDestinations(MachineBasicBlock &SrcMBB,
MachineBasicBlock *&TrueMBB,
MachineBasicBlock *&FalseMBB,
SmallVectorImpl<MachineOperand> &Cond);
bool mustRetainExeczBranch(const MachineBasicBlock &From,
const MachineBasicBlock &To) const;
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);

public:
static char ID;
Expand Down Expand Up @@ -258,24 +273,97 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
return true;
}

bool SIPreEmitPeephole::getBlockDestinations(
MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
return false;

if (!FalseMBB)
FalseMBB = SrcMBB.getNextNode();

return true;
}

bool SIPreEmitPeephole::mustRetainExeczBranch(
const MachineBasicBlock &From, const MachineBasicBlock &To) const {
unsigned NumInstr = 0;
const MachineFunction *MF = From.getParent();

for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
MBBI != End && MBBI != ToI; ++MBBI) {
const MachineBasicBlock &MBB = *MBBI;

for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {
// When a uniform loop is inside non-uniform control flow, the branch
// leaving the loop might never be taken when EXEC = 0.
// Hence we should retain cbranch out of the loop lest it become infinite.
if (I->isConditionalBranch())
return true;

if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
return true;

// These instructions are potentially expensive even if EXEC = 0.
if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
TII->isDS(*I) || I->getOpcode() == AMDGPU::S_WAITCNT)
return true;

++NumInstr;
if (NumInstr >= SkipThreshold)
return true;
}
}

return false;
}

// Returns true if the skip branch instruction is removed.
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
MachineBasicBlock &SrcMBB) {
MachineBasicBlock *TrueMBB = nullptr;
MachineBasicBlock *FalseMBB = nullptr;
SmallVector<MachineOperand, 1> Cond;

if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
return false;

// Consider only the forward branches.
if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
mustRetainExeczBranch(*FalseMBB, *TrueMBB))
return false;

LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
MI.eraseFromParent();
SrcMBB.removeSuccessor(TrueMBB);

return true;
}

bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
MachineBasicBlock *EmptyMBBAtEnd = nullptr;
bool Changed = false;

MF.RenumberBlocks();

for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();
MachineBasicBlock::iterator TermI = MBBE;
// Check first terminator for VCC branches to optimize
// Check first terminator for branches to optimize
if (TermI != MBB.end()) {
MachineInstr &MI = *TermI;
switch (MI.getOpcode()) {
case AMDGPU::S_CBRANCH_VCCZ:
case AMDGPU::S_CBRANCH_VCCNZ:
Changed |= optimizeVccBranch(MI);
continue;
case AMDGPU::S_CBRANCH_EXECZ:
Changed |= removeExeczBranch(MI, MBB);
continue;
default:
break;
}
Expand Down
159 changes: 0 additions & 159 deletions llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp

This file was deleted.

Loading

0 comments on commit 5df2af8

Please sign in to comment.