[AMDGPU]: Fall back to default mutations when iglp is not applied #93418
base: main
Conversation
Change-Id: I2e1f4f4610275d3d629c2b34ced331d78ea0ca06
@llvm/pr-subscribers-backend-amdgpu

Author: Jeffrey Byrnes (jrbyrnes)

Changes

Some regions may have IGLPInstrs but no IGLP mutation opportunities. In these regions we should fall back to the default mutations, which may still improve performance. This is especially useful for more selective mutations like iglp_opt(2).

This is a second attempt at this feature; the first attempt was part of 8f2bd8a. The issue with that attempt was that in the post-RA scheduler we didn't restore the saved mutations per region (::schedule is called per region, and ::finalizeSchedule is called per block). So, with multiple IGLP regions in the same block, the second and later regions would fall back to an IGroupLPDAGMutation whose SavedMutations contained itself (see the sketch after the diff).

Full diff: https://github.com/llvm/llvm-project/pull/93418.diff

5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 57769fe998d1f..e3f7248507953 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2337,6 +2337,8 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
ScheduleDAGMI *DAG;
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations;
+
// Organize lists of SchedGroups by their SyncID. SchedGroups /
// SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
// between then.
@@ -2379,7 +2381,10 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial;
IGroupLPDAGMutation() = default;
- IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {}
+ IGroupLPDAGMutation(
+ AMDGPU::SchedulingPhase Phase,
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations)
+ : SavedMutations(SavedMutations), Phase(Phase) {}
};
unsigned SchedGroup::NumSchedGroups = 0;
@@ -2597,6 +2602,13 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
PS.solve();
return;
}
+
+ if (!SavedMutations)
+ return;
+
+ // We did not apply a mutation, fall back to SavedMutations
+ for (auto &m : *SavedMutations)
+ m->apply(DAG);
}
void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
@@ -2695,9 +2707,10 @@ namespace llvm {
/// same scheduling region (e.g. pre and post-RA scheduling / multiple
/// scheduling "phases"), we can reenter this mutation framework more than once
/// for a given region.
-std::unique_ptr<ScheduleDAGMutation>
-createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) {
- return std::make_unique<IGroupLPDAGMutation>(Phase);
+std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(
+ AMDGPU::SchedulingPhase Phase,
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations) {
+ return std::make_unique<IGroupLPDAGMutation>(Phase, SavedMutations);
}
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
index aff7096f26d67..46ef4d702d002 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
@@ -20,8 +20,9 @@ namespace AMDGPU {
enum class SchedulingPhase { Initial, PreRAReentry, PostRA };
} // namespace AMDGPU
-std::unique_ptr<ScheduleDAGMutation>
-createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase);
+std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(
+ AMDGPU::SchedulingPhase Phase,
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations);
} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 20329dea60275..b9a0cfc5b130d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -471,7 +471,8 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
- DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
+ DAG->addMutation(
+ createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial, nullptr));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
return DAG;
@@ -481,7 +482,8 @@ static ScheduleDAGInstrs *
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
ScheduleDAGMILive *DAG =
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
- DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
+ DAG->addMutation(
+ createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial, nullptr));
return DAG;
}
@@ -893,7 +895,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
DAG->addMutation(
- createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
+ createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA, nullptr));
if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
DAG->addMutation(createVOPDPairingMutation());
return DAG;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 94d93390d0916..3882ab4cf58bf 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -713,8 +713,8 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
return false;
SavedMutations.swap(DAG.Mutations);
- DAG.addMutation(
- createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry));
+ DAG.addMutation(createIGroupLPDAGMutation(
+ AMDGPU::SchedulingPhase::PreRAReentry, nullptr));
InitialOccupancy = DAG.MinOccupancy;
// Aggressivly try to reduce register pressure in the unclustered high RP
@@ -858,7 +858,8 @@ bool GCNSchedStage::initGCNRegion() {
StageID == GCNSchedStageID::ILPInitialSchedule;
DAG.addMutation(createIGroupLPDAGMutation(
IsInitialStage ? AMDGPU::SchedulingPhase::Initial
- : AMDGPU::SchedulingPhase::PreRAReentry));
+ : AMDGPU::SchedulingPhase::PreRAReentry,
+ &SavedMutations));
}
return true;
@@ -1577,15 +1578,16 @@ void GCNPostScheduleDAGMILive::schedule() {
if (HasIGLPInstrs) {
SavedMutations.clear();
SavedMutations.swap(Mutations);
- addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
+ addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA,
+ &SavedMutations));
}
ScheduleDAGMI::schedule();
-}
-void GCNPostScheduleDAGMILive::finalizeSchedule() {
if (HasIGLPInstrs)
SavedMutations.swap(Mutations);
+}
+void GCNPostScheduleDAGMILive::finalizeSchedule() {
ScheduleDAGMI::finalizeSchedule();
}
diff --git a/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir b/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir
index 0d84dc0bdc53e..287e58dfc536e 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir
+++ b/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir
@@ -18,3 +18,23 @@ body: |
%2 = V_ADD_F32_e64 0, killed %1, 0, 1, 0, 0, implicit $mode, implicit $exec
%3 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32))
...
+---
+# GCN-LABEL: name: cluster_flat_loads_iglp_opt
+# GCN: FLAT_LOAD_DWORD %0, 0
+# GCN-NEXT: FLAT_LOAD_DWORD %0, 4
+# GCN-NEXT: V_ADD_F32_e64
+name: cluster_flat_loads_iglp_opt
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vreg_64 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: vgpr_32 }
+body: |
+ bb.0:
+ %0 = IMPLICIT_DEF
+ %1 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+ %2 = V_ADD_F32_e64 0, killed %1, 0, 1, 0, 0, implicit $mode, implicit $exec
+ %3 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+ IGLP_OPT 2
+...
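To make the failure mode of the first attempt concrete, here is a minimal standalone sketch. The names (`Mutation`, `DefaultMutation`, `IGLPMutation`) are invented stand-ins for ScheduleDAGMutation, the default clustering mutations, and IGroupLPDAGMutation; the two vectors model the scheduler's Mutations/SavedMutations lists. It is a simplified model of the bug, not the actual LLVM classes:

```cpp
#include <memory>
#include <vector>

struct Mutation {
  virtual ~Mutation() = default;
  virtual void apply() = 0;
};

struct DefaultMutation : Mutation {
  void apply() override {} // stands in for e.g. load/store clustering
};

struct IGLPMutation : Mutation {
  std::vector<std::unique_ptr<Mutation>> *Saved; // fallback list
  explicit IGLPMutation(std::vector<std::unique_ptr<Mutation>> *S) : Saved(S) {}
  void apply() override {
    // No IGLP opportunity found: fall back to the saved default mutations.
    // If *Saved happens to contain this very object, this recurses forever.
    for (auto &M : *Saved)
      M->apply();
  }
};

int main() {
  std::vector<std::unique_ptr<Mutation>> Mutations, SavedMutations;
  Mutations.push_back(std::make_unique<DefaultMutation>());

  // Region 1 (::schedule): stash the defaults, install the IGLP mutation.
  SavedMutations.swap(Mutations);
  Mutations.push_back(std::make_unique<IGLPMutation>(&SavedMutations));

  // In the first attempt, the swap back happened only in ::finalizeSchedule,
  // i.e. once per block. For a second IGLP region in the same block,
  // ::schedule runs again before anything is restored:
  SavedMutations.swap(Mutations);
  // SavedMutations now holds region 1's IGLPMutation, whose Saved pointer is
  // &SavedMutations -- the very vector it now lives in. If region 2's IGLP
  // mutation falls back, it applies region 1's, which iterates a list
  // containing itself: infinite recursion.
  Mutations.push_back(std::make_unique<IGLPMutation>(&SavedMutations));

  // The fix in this PR: swap SavedMutations back at the end of ::schedule,
  // so each region starts from the genuine default mutation list.
}
```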
Change-Id: I55fe6465aed5b6d78f7e3f56db3b3062a08d429b
@@ -2337,6 +2337,8 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {

   ScheduleDAGMI *DAG;

+  std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations;
It's not obvious to me why this is a pointer to a vector. Can this just be an ArrayRef? Why does the mutation have to manage the other mutations? I thought all the mutations were added sequentially already, where this would just be one in the set
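For reference, the signature the reviewer seems to have in mind might look something like the following. This is a hypothetical sketch of the suggestion, not code from the PR; an empty ArrayRef would stand in for the current nullptr callers:

```cpp
// Hypothetical ArrayRef-based variant of the new API, per the review comment.
#include "llvm/ADT/ArrayRef.h"

std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(
    AMDGPU::SchedulingPhase Phase,
    ArrayRef<std::unique_ptr<ScheduleDAGMutation>> SavedMutations = {});

// apply() would then loop over the view instead of dereferencing a pointer:
//   for (const auto &M : SavedMutations)
//     M->apply(DAG);
```

One caveat to verify with this variant: an ArrayRef captures the vector's buffer at the call site, so it stays valid only if SavedMutations is not reallocated or swapped between the mutation's construction and its apply().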