diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 57769fe998d1fe..e3f72485079538 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2337,6 +2337,8 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
 
   ScheduleDAGMI *DAG;
 
+  std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations;
+
   // Organize lists of SchedGroups by their SyncID. SchedGroups /
   // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
   // between then.
@@ -2379,7 +2381,10 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
   AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial;
 
   IGroupLPDAGMutation() = default;
-  IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {}
+  IGroupLPDAGMutation(
+      AMDGPU::SchedulingPhase Phase,
+      std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations)
+      : SavedMutations(SavedMutations), Phase(Phase) {}
 };
 
 unsigned SchedGroup::NumSchedGroups = 0;
@@ -2597,6 +2602,13 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
     PS.solve();
     return;
   }
+
+  if (!SavedMutations)
+    return;
+
+  // We did not apply a mutation, fall back to SavedMutations
+  for (auto &m : *SavedMutations)
+    m->apply(DAG);
 }
 
 void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
@@ -2695,9 +2707,10 @@ namespace llvm {
 /// same scheduling region (e.g. pre and post-RA scheduling / multiple
 /// scheduling "phases"), we can reenter this mutation framework more than once
 /// for a given region.
-std::unique_ptr<ScheduleDAGMutation>
-createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) {
-  return std::make_unique<IGroupLPDAGMutation>(Phase);
+std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(
+    AMDGPU::SchedulingPhase Phase,
+    std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations) {
+  return std::make_unique<IGroupLPDAGMutation>(Phase, SavedMutations);
 }
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
index aff7096f26d671..46ef4d702d0022 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
@@ -20,8 +20,9 @@ namespace AMDGPU {
 enum class SchedulingPhase { Initial, PreRAReentry, PostRA };
 } // namespace AMDGPU
 
-std::unique_ptr<ScheduleDAGMutation>
-createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase);
+std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(
+    AMDGPU::SchedulingPhase Phase,
+    std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations);
 
 } // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 20329dea602759..b9a0cfc5b130d8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -471,7 +471,8 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
   if (ST.shouldClusterStores())
     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
-  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
+  DAG->addMutation(
+      createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial, nullptr));
   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
   return DAG;
@@ -481,7 +482,8 @@ static ScheduleDAGInstrs *
 createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
   ScheduleDAGMILive *DAG =
       new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
-  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
+  DAG->addMutation(
+      createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial, nullptr));
   return DAG;
 }
 
@@ -893,7 +895,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
       DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
     DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
     DAG->addMutation(
-        createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
+        createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA, nullptr));
     if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
       DAG->addMutation(createVOPDPairingMutation());
     return DAG;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 94d93390d0916f..3882ab4cf58bff 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -713,8 +713,8 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
     return false;
 
   SavedMutations.swap(DAG.Mutations);
-  DAG.addMutation(
-      createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry));
+  DAG.addMutation(createIGroupLPDAGMutation(
+      AMDGPU::SchedulingPhase::PreRAReentry, nullptr));
 
   InitialOccupancy = DAG.MinOccupancy;
   // Aggressivly try to reduce register pressure in the unclustered high RP
@@ -858,7 +858,8 @@ bool GCNSchedStage::initGCNRegion() {
                           StageID == GCNSchedStageID::ILPInitialSchedule;
     DAG.addMutation(createIGroupLPDAGMutation(
         IsInitialStage ? AMDGPU::SchedulingPhase::Initial
-                       : AMDGPU::SchedulingPhase::PreRAReentry));
+                       : AMDGPU::SchedulingPhase::PreRAReentry,
+        &SavedMutations));
   }
 
   return true;
@@ -1577,15 +1578,16 @@ void GCNPostScheduleDAGMILive::schedule() {
   if (HasIGLPInstrs) {
     SavedMutations.clear();
     SavedMutations.swap(Mutations);
-    addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
+    addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA,
+                                          &SavedMutations));
   }
 
   ScheduleDAGMI::schedule();
-}
 
-void GCNPostScheduleDAGMILive::finalizeSchedule() {
   if (HasIGLPInstrs)
     SavedMutations.swap(Mutations);
+}
 
+void GCNPostScheduleDAGMILive::finalizeSchedule() {
   ScheduleDAGMI::finalizeSchedule();
 }
diff --git a/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir b/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir
index 0d84dc0bdc53ef..ae614beb8d3a1e 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir
+++ b/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir
@@ -6,15 +6,25 @@
 # GCN-NEXT: V_ADD_F32_e64
 name: cluster_flat_loads
 tracksRegLiveness: true
-registers:
-  - { id: 0, class: vreg_64 }
-  - { id: 1, class: vgpr_32 }
-  - { id: 2, class: vgpr_32 }
-  - { id: 3, class: vgpr_32 }
 body: |
   bb.0:
-    %0 = IMPLICIT_DEF
-    %1 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
-    %2 = V_ADD_F32_e64 0, killed %1, 0, 1, 0, 0, implicit $mode, implicit $exec
-    %3 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+    %0:vreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0:vreg_64, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+    %2:vgpr_32 = V_ADD_F32_e64 0, killed %1:vgpr_32, 0, 1, 0, 0, implicit $mode, implicit $exec
+    %3:vgpr_32 = FLAT_LOAD_DWORD %0:vreg_64, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+...
+---
+# GCN-LABEL: name: cluster_flat_loads_iglp_opt
+# GCN: FLAT_LOAD_DWORD %0, 0
+# GCN-NEXT: FLAT_LOAD_DWORD %0, 4
+# GCN-NEXT: V_ADD_F32_e64
+name: cluster_flat_loads_iglp_opt
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %0:vreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0:vreg_64, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+    %2:vgpr_32 = V_ADD_F32_e64 0, killed %1:vgpr_32, 0, 1, 0, 0, implicit $mode, implicit $exec
+    %3:vgpr_32 = FLAT_LOAD_DWORD %0:vreg_64, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+    IGLP_OPT 2
 ...