-
Notifications
You must be signed in to change notification settings - Fork 12.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Late temporal divergence lowering for SDAG #67033
Changes from all commits
2a1d4c3
7a1b800
96d5dbb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -341,6 +341,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl { | |
using DivergenceDescriptorT = | ||
typename SyncDependenceAnalysisT::DivergenceDescriptor; | ||
using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap; | ||
using UseOutsideCycleInfoT = | ||
typename std::tuple<ConstValueRefT, const InstructionT *, | ||
SmallVector<BlockT *, 4>>; | ||
|
||
GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI, | ||
const TargetTransformInfo *TTI) | ||
|
@@ -396,6 +399,8 @@ template <typename ContextT> class GenericUniformityAnalysisImpl { | |
|
||
void print(raw_ostream &out) const; | ||
|
||
iterator_range<const UseOutsideCycleInfoT *> uses_outside_cycle() const; | ||
|
||
protected: | ||
/// \brief Value/block pair representing a single phi input. | ||
struct PhiInput { | ||
|
@@ -427,6 +432,7 @@ template <typename ContextT> class GenericUniformityAnalysisImpl { | |
|
||
// Recognized cycles with divergent exits. | ||
SmallPtrSet<const CycleT *, 16> DivergentExitCycles; | ||
SmallVector<UseOutsideCycleInfoT, 4> UsesOutsideCycle; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it correct that this only tracks uses outside a cycle with divergent exits? This should be in the name or at least in a comment. |
||
|
||
// Cycles assumed to be divergent. | ||
// | ||
|
@@ -470,6 +476,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl { | |
/// \brief Whether \p Def is divergent when read in \p ObservingBlock. | ||
bool isTemporalDivergent(const BlockT &ObservingBlock, | ||
const InstructionT &Def) const; | ||
|
||
void recordUseOutsideCycle(ConstValueRefT Src, const InstructionT *UserInstr, | ||
const CycleT &DefCycle); | ||
}; | ||
|
||
template <typename ImplT> | ||
|
@@ -1210,6 +1219,18 @@ void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const { | |
} | ||
} | ||
|
||
template <typename ContextT> | ||
using UseOutsideCycleInfoT = | ||
typename std::tuple<typename ContextT::ConstValueRefT, | ||
const typename ContextT::InstructionT *, | ||
SmallVector<typename ContextT::BlockT *, 4>>; | ||
|
||
template <typename ContextT> | ||
iterator_range<const UseOutsideCycleInfoT<ContextT> *> | ||
GenericUniformityAnalysisImpl<ContextT>::uses_outside_cycle() const { | ||
return make_range(UsesOutsideCycle.begin(), UsesOutsideCycle.end()); | ||
} | ||
|
||
template <typename ContextT> | ||
bool GenericUniformityInfo<ContextT>::hasDivergence() const { | ||
return DA->hasDivergence(); | ||
|
@@ -1248,6 +1269,12 @@ void GenericUniformityInfo<ContextT>::print(raw_ostream &out) const { | |
DA->print(out); | ||
} | ||
|
||
template <typename ContextT> | ||
iterator_range<const UseOutsideCycleInfoT<ContextT> *> | ||
GenericUniformityInfo<ContextT>::uses_outside_cycle() const { | ||
return DA->uses_outside_cycle(); | ||
} | ||
|
||
template <typename ContextT> | ||
void llvm::ModifiedPostOrder<ContextT>::computeStackPO( | ||
SmallVectorImpl<const BlockT *> &Stack, const CycleInfoT &CI, | ||
|
@@ -1367,6 +1394,14 @@ void llvm::ModifiedPostOrder<ContextT>::compute(const CycleInfoT &CI) { | |
computeStackPO(Stack, CI, nullptr, Finalized); | ||
} | ||
|
||
template <typename ContextT> | ||
void GenericUniformityAnalysisImpl<ContextT>::recordUseOutsideCycle( | ||
ConstValueRefT Src, const InstructionT *UserInstr, const CycleT &DefCycle) { | ||
SmallVector<BlockT *, 4> TmpExitBlocks; | ||
DefCycle.getExitBlocks(TmpExitBlocks); | ||
UsesOutsideCycle.push_back({Src, UserInstr, TmpExitBlocks}); | ||
} | ||
|
||
} // namespace llvm | ||
|
||
#undef DEBUG_TYPE | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
//===- AMDGPUTemporalDivergenceLowering.cpp -------------------------------===// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#include "AMDGPU.h" | ||
#include "GCNSubtarget.h" | ||
#include "llvm/CodeGen/MachineUniformityAnalysis.h" | ||
#include "llvm/InitializePasses.h" | ||
|
||
#define DEBUG_TYPE "temporal-divergence-lowering" | ||
|
||
using namespace llvm; | ||
|
||
namespace { | ||
|
||
class AMDGPUTemporalDivergenceLowering : public MachineFunctionPass { | ||
public: | ||
static char ID; | ||
|
||
public: | ||
AMDGPUTemporalDivergenceLowering() : MachineFunctionPass(ID) { | ||
initializeAMDGPUTemporalDivergenceLoweringPass( | ||
*PassRegistry::getPassRegistry()); | ||
} | ||
|
||
bool runOnMachineFunction(MachineFunction &MF) override; | ||
|
||
StringRef getPassName() const override { | ||
return "Temporal divergence lowering"; | ||
} | ||
|
||
void getAnalysisUsage(AnalysisUsage &AU) const override { | ||
AU.setPreservesCFG(); | ||
AU.addRequired<MachineCycleInfoWrapperPass>(); | ||
AU.addRequired<MachineDominatorTree>(); | ||
MachineFunctionPass::getAnalysisUsage(AU); | ||
} | ||
}; | ||
|
||
} // End anonymous namespace. | ||
|
||
// Pass registration. The pass is registered under the DEBUG_TYPE string with
// the description "Temporal divergence lowering", and lists
// MachineCycleInfoWrapperPass and MachineDominatorTree as dependencies
// (the same analyses requested in getAnalysisUsage).
// NOTE(review): the two trailing `false` arguments are presumably the
// "CFG only" and "is analysis" flags of the INITIALIZE_PASS macros -- confirm.
INITIALIZE_PASS_BEGIN(AMDGPUTemporalDivergenceLowering, DEBUG_TYPE, | ||
"Temporal divergence lowering", false, false) | ||
INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass) | ||
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) | ||
INITIALIZE_PASS_END(AMDGPUTemporalDivergenceLowering, DEBUG_TYPE, | ||
"Temporal divergence lowering", false, false) | ||
|
||
// Storage for the pass ID declared in the class; it is passed to the
// MachineFunctionPass constructor.
char AMDGPUTemporalDivergenceLowering::ID = 0; | ||
|
||
// Reference to the pass ID, defined in namespace llvm so that code without
// access to the class in the anonymous namespace can refer to the pass.
char &llvm::AMDGPUTemporalDivergenceLoweringID = | ||
AMDGPUTemporalDivergenceLowering::ID; | ||
|
||
/// Factory: allocate a new AMDGPUTemporalDivergenceLowering pass and return it
/// as a FunctionPass pointer.
FunctionPass *llvm::createAMDGPUTemporalDivergenceLoweringPass() {
  FunctionPass *Pass = new AMDGPUTemporalDivergenceLowering();
  return Pass;
}
|
||
static void replaceUseRegisterWith(const MachineInstr *MI, Register Reg, | ||
Register Newreg) { | ||
for (unsigned i = 0; i < MI->getNumOperands(); ++i) { | ||
const MachineOperand &Op = MI->getOperand(i); | ||
if (Op.isReg() && Op.getReg() == Reg) { | ||
const_cast<MachineInstr *>(MI)->getOperand(i).setReg(Newreg); | ||
} | ||
} | ||
} | ||
// Get poiners to build instruction just after MI (skips phis if needed) | ||
static std::pair<MachineBasicBlock *, MachineBasicBlock::iterator> | ||
getInsertAfterPtrs(MachineInstr *MI) { | ||
MachineBasicBlock *InsertMBB = MI->getParent(); | ||
return std::make_pair( | ||
InsertMBB, InsertMBB->SkipPHIsAndLabels(std::next(MI->getIterator()))); | ||
} | ||
|
||
bool AMDGPUTemporalDivergenceLowering::runOnMachineFunction( | ||
MachineFunction &MF) { | ||
|
||
MachineCycleInfo &CycleInfo = | ||
getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo(); | ||
MachineDominatorTree &DomTree = getAnalysis<MachineDominatorTree>(); | ||
|
||
MachineUniformityInfo MUI = | ||
computeMachineUniformityInfo(MF, CycleInfo, DomTree.getBase(), true); | ||
|
||
MachineRegisterInfo &MRI = MF.getRegInfo(); | ||
const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); | ||
const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); | ||
const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo(); | ||
|
||
// Temporal divergence lowering is required when SrcReg is uniform and
// UserInstr is divergent. UserInstr is uniform only when the cycle has no
// divergent exits.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The changes should consistently use the term "cycle" and not "loop" everywhere. What does this comment mean? Is it a pre-condition or a post-condition for this pass? By "loop is uniform", do you mean the cycle does not have divergent exits? Since UserInstr is outside the cycle, it could be divergent for any other reason too, like some operand which is not inside the cycle. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It was meant to be a simple comment explaining when we need to insert a copy to a VGPR; it is also there to give better context for the FixMe comment below. UserInstr is reported as uniform in some cases (cycle had divergent exit). I would expect that it was enough to ask if SrcReg was uniform (UserInstr should be divergent because of temporal divergence unless it was a uniform loop). However, machine uniformity analysis detects all temporal divergence cases that require lowering.
I am not sure about the terminology; can you point me to a reference? But at this point cycles have exactly one entry point (is that good enough to call them natural loops?) and exactly one exit. I only looked at tests where something was wrong, and all of them had a "divergent exit from the cycle". There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ok, I finally parsed this statement correctly: "UserInstr is uniform only when loop is uniform." I got confused by the "only when". About the term "loop", how do you know that every cycle is reducible at this point? This pass is fairly generic, and could be moved around. If this pass actually assumes that every cycle is reducible, then there should be asserts about that. Also separately, instead of saying "is uniform", it's less confusing to say "has divergent exits". |
||
for (auto [SrcReg, UserInstr, CycleExitBlocks] : MUI.uses_outside_cycle()) { | ||
if (!MUI.isUniform(SrcReg) || !MUI.isDivergent(UserInstr)) | ||
continue; | ||
|
||
MachineInstr *UniformSourceInstr = MRI.getVRegDef(SrcReg); | ||
|
||
// FixMe: SrcReg is lane mask in this case. Find a better way to detect it. | ||
if (UniformSourceInstr->getOpcode() == AMDGPU::SI_IF_BREAK || | ||
UserInstr->getOpcode() == AMDGPU::SI_IF) | ||
continue; | ||
|
||
unsigned Size = TRI.getRegSizeInBits(*MRI.getRegClassOrNull(SrcReg)); | ||
Register VgprDst = | ||
MRI.createVirtualRegister(TRI.getVGPRClassForBitWidth(Size)); | ||
|
||
auto [MBB, AfterUniformSourceReg] = getInsertAfterPtrs(UniformSourceInstr); | ||
BuildMI(*MBB, AfterUniformSourceReg, {}, TII.get(AMDGPU::COPY)) | ||
.addDef(VgprDst) | ||
.addReg(SrcReg) | ||
.addReg(AMDGPU::EXEC, RegState::Implicit); | ||
|
||
replaceUseRegisterWith(UserInstr, SrcReg, VgprDst); | ||
} | ||
|
||
return true; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it correct that this only tracks uses outside a cycle with divergent exits? This should be in the name or at least in a comment.