diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index ddd0746ccd9163..ee3fd1b56bc696 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -341,6 +341,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
   using DivergenceDescriptorT =
       typename SyncDependenceAnalysisT::DivergenceDescriptor;
   using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap;
+  using UseOutsideCycleInfoT =
+      typename std::tuple<ConstValueRefT, const InstructionT *,
+                          SmallVector<BlockT *, 4>>;
 
   GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI,
                                 const TargetTransformInfo *TTI)
@@ -396,6 +399,8 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
 
   void print(raw_ostream &out) const;
 
+  iterator_range<const UseOutsideCycleInfoT *> uses_outside_cycle() const;
+
 protected:
   /// \brief Value/block pair representing a single phi input.
   struct PhiInput {
@@ -427,6 +432,7 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
 
   // Recognized cycles with divergent exits.
   SmallPtrSet<const CycleT *, 16> DivergentExitCycles;
+  SmallVector<UseOutsideCycleInfoT, 4> UsesOutsideCycle;
 
   // Cycles assumed to be divergent.
   //
@@ -470,6 +476,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
   /// \brief Whether \p Def is divergent when read in \p ObservingBlock.
   bool isTemporalDivergent(const BlockT &ObservingBlock,
                            const InstructionT &Def) const;
+
+  void recordUseOutsideCycle(ConstValueRefT Src, const InstructionT *UserInstr,
+                             const CycleT &DefCycle);
 };
 
 template <typename ContextT>
@@ -1210,6 +1219,18 @@ void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
   }
 }
 
+template <typename ContextT>
+using UseOutsideCycleInfoT =
+    typename std::tuple<typename ContextT::ConstValueRefT,
+                        const typename ContextT::InstructionT *,
+                        SmallVector<typename ContextT::BlockT *, 4>>;
+
+template <typename ContextT>
+iterator_range<const UseOutsideCycleInfoT<ContextT> *>
+GenericUniformityAnalysisImpl<ContextT>::uses_outside_cycle() const {
+  return make_range(UsesOutsideCycle.begin(), UsesOutsideCycle.end());
+}
+
 template <typename ContextT>
 bool GenericUniformityInfo<ContextT>::hasDivergence() const {
   return DA->hasDivergence();
@@ -1248,6 +1269,12 @@ void GenericUniformityInfo<ContextT>::print(raw_ostream &out) const {
   DA->print(out);
 }
 
+template <typename ContextT>
+iterator_range<const UseOutsideCycleInfoT<ContextT> *>
+GenericUniformityInfo<ContextT>::uses_outside_cycle() const {
+  return DA->uses_outside_cycle();
+}
+
 template <typename ContextT>
 void llvm::ModifiedPostOrder<ContextT>::computeStackPO(
     SmallVectorImpl<const BlockT *> &Stack, const CycleInfoT &CI,
@@ -1367,6 +1394,14 @@ void llvm::ModifiedPostOrder<ContextT>::compute(const CycleInfoT &CI) {
   computeStackPO(Stack, CI, nullptr, Finalized);
 }
 
+template <typename ContextT>
+void GenericUniformityAnalysisImpl<ContextT>::recordUseOutsideCycle(
+    ConstValueRefT Src, const InstructionT *UserInstr, const CycleT &DefCycle) {
+  SmallVector<BlockT *, 4> TmpExitBlocks;
+  DefCycle.getExitBlocks(TmpExitBlocks);
+  UsesOutsideCycle.push_back({Src, UserInstr, TmpExitBlocks});
+}
+
 } // namespace llvm
 
 #undef DEBUG_TYPE
diff --git a/llvm/include/llvm/ADT/GenericUniformityInfo.h b/llvm/include/llvm/ADT/GenericUniformityInfo.h
index e53afccc020b46..dd91f9b34b8300 100644
--- a/llvm/include/llvm/ADT/GenericUniformityInfo.h
+++ b/llvm/include/llvm/ADT/GenericUniformityInfo.h
@@ -39,6 +39,9 @@ template <typename ContextT> class GenericUniformityInfo {
   using CycleInfoT = GenericCycleInfo<ContextT>;
   using CycleT = typename CycleInfoT::CycleT;
 
+  using UseOutsideCycleInfoT =
+      typename std::tuple<ConstValueRefT, const InstructionT *,
+                          SmallVector<BlockT *, 4>>;
   GenericUniformityInfo(const DominatorTreeT &DT, const CycleInfoT &CI,
                         const TargetTransformInfo *TTI = nullptr);
 
@@ -78,6 +81,8 @@ template <typename ContextT> class GenericUniformityInfo {
 
   void print(raw_ostream &Out) const;
 
+  iterator_range<const UseOutsideCycleInfoT *> uses_outside_cycle() const;
+
 private:
   using ImplT = GenericUniformityAnalysisImpl<ContextT>;
 
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 2d617db431c588..9679d884a54f5b 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -80,12 +80,16 @@ template <>
 void llvm::GenericUniformityAnalysisImpl<
     SSAContext>::propagateTemporalDivergence(const Instruction &I,
                                              const Cycle &DefCycle) {
-  if (isDivergent(I))
-    return;
   for (auto *User : I.users()) {
     auto *UserInstr = cast<Instruction>(User);
     if (DefCycle.contains(UserInstr->getParent()))
       continue;
+
+    recordUseOutsideCycle(cast<Value>(&I), UserInstr, DefCycle);
+
+    if (isDivergent(I))
+      continue;
+
     markDivergent(*UserInstr);
   }
 }
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index b4cbb93d758ef2..ea056f6c80361b 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -288,7 +288,8 @@ static bool blockPrologueInterferes(const MachineBasicBlock *BB,
     if (!Reg)
       continue;
     if (MO.isUse()) {
-      if (Reg.isPhysical() && MRI && MRI->isConstantPhysReg(Reg))
+      if (Reg.isPhysical() &&
+          (TII->isIgnorableUse(MO) || (MRI && MRI->isConstantPhysReg(Reg))))
         continue;
       if (PI->modifiesRegister(Reg, TRI))
         return true;
@@ -1006,24 +1007,16 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB,
   if (MBB == SuccToSinkTo)
     return nullptr;
 
-  if (!SuccToSinkTo)
-    return nullptr;
-
   // It's not safe to sink instructions to EH landing pad. Control flow into
   // landing pad is implicitly defined.
-  if (SuccToSinkTo->isEHPad())
+  if (SuccToSinkTo && SuccToSinkTo->isEHPad())
     return nullptr;
 
   // It ought to be okay to sink instructions into an INLINEASM_BR target, but
   // only if we make sure that MI occurs _before_ an INLINEASM_BR instruction in
   // the source block (which this code does not yet do). So for now, forbid
   // doing so.
-  if (SuccToSinkTo->isInlineAsmBrIndirectTarget())
-    return nullptr;
-
-  MachineBasicBlock::const_iterator InsertPos =
-      SuccToSinkTo->SkipPHIsAndLabels(SuccToSinkTo->begin());
-  if (blockPrologueInterferes(SuccToSinkTo, InsertPos, MI, TRI, TII, MRI))
+  if (SuccToSinkTo && SuccToSinkTo->isInlineAsmBrIndirectTarget())
     return nullptr;
 
   return SuccToSinkTo;
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 3e0fe2b1ba087f..0d277aaf29cf71 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -117,11 +117,16 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::
     if (!Op.getReg().isVirtual())
       continue;
     auto Reg = Op.getReg();
-    if (isDivergent(Reg))
-      continue;
+
     for (MachineInstr &UserInstr : RegInfo.use_instructions(Reg)) {
       if (DefCycle.contains(UserInstr.getParent()))
         continue;
+
+      recordUseOutsideCycle(Reg, &UserInstr, DefCycle);
+
+      if (isDivergent(Reg))
+        continue;
+
       markDivergent(UserInstr);
     }
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index b7101f40115470..be1b263bebf874 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -35,6 +35,7 @@ FunctionPass *createSIAnnotateControlFlowPass();
 FunctionPass *createSIFoldOperandsPass();
 FunctionPass *createSIPeepholeSDWAPass();
 FunctionPass *createSILowerI1CopiesPass();
+FunctionPass *createAMDGPUTemporalDivergenceLoweringPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass();
 FunctionPass *createSIWholeQuadModePass();
@@ -151,6 +152,9 @@ extern char &SILowerWWMCopiesID;
 void initializeSILowerI1CopiesPass(PassRegistry &);
 extern char &SILowerI1CopiesID;
 
+void initializeAMDGPUTemporalDivergenceLoweringPass(PassRegistry &);
+extern char &AMDGPUTemporalDivergenceLoweringID;
+
 void initializeSILowerSGPRSpillsPass(PassRegistry &);
 extern char &SILowerSGPRSpillsID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 481fbaf1543a4e..a413918ab2c8d0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -358,6 +358,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUDAGToDAGISelPass(*PR);
   initializeGCNDPPCombinePass(*PR);
   initializeSILowerI1CopiesPass(*PR);
+  initializeAMDGPUTemporalDivergenceLoweringPass(*PR);
   initializeSILowerWWMCopiesPass(*PR);
   initializeSILowerSGPRSpillsPass(*PR);
   initializeSIFixSGPRCopiesPass(*PR);
@@ -1240,6 +1241,8 @@ bool GCNPassConfig::addGlobalInstructionSelect() {
 }
 
 void GCNPassConfig::addPreRegAlloc() {
+  addPass(createAMDGPUTemporalDivergenceLoweringPass());
+
   if (LateCFGStructurize) {
     addPass(createAMDGPUMachineCFGStructurizerPass());
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTemporalDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTemporalDivergenceLowering.cpp
new file mode 100644
index 00000000000000..a53d3b269ece87
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTemporalDivergenceLowering.cpp
@@ -0,0 +1,121 @@
+//===- AMDGPUTemporalDivergenceLowering.cpp -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/CodeGen/MachineUniformityAnalysis.h"
+#include "llvm/InitializePasses.h"
+
+#define DEBUG_TYPE "temporal-divergence-lowering"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUTemporalDivergenceLowering : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  AMDGPUTemporalDivergenceLowering() : MachineFunctionPass(ID) {
+    initializeAMDGPUTemporalDivergenceLoweringPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "Temporal divergence lowering";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineCycleInfoWrapperPass>();
+    AU.addRequired<MachineDominatorTree>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(AMDGPUTemporalDivergenceLowering, DEBUG_TYPE,
+                      "Temporal divergence lowering", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(AMDGPUTemporalDivergenceLowering, DEBUG_TYPE,
+                    "Temporal divergence lowering", false, false)
+
+char AMDGPUTemporalDivergenceLowering::ID = 0;
+
+char &llvm::AMDGPUTemporalDivergenceLoweringID =
+    AMDGPUTemporalDivergenceLowering::ID;
+
+FunctionPass *llvm::createAMDGPUTemporalDivergenceLoweringPass() {
+  return new AMDGPUTemporalDivergenceLowering();
+}
+
+static void replaceUseRegisterWith(const MachineInstr *MI, Register Reg,
+                                   Register Newreg) {
+  for (unsigned i = 0; i < MI->getNumOperands(); ++i) {
+    const MachineOperand &Op = MI->getOperand(i);
+    if (Op.isReg() && Op.getReg() == Reg) {
+      const_cast<MachineInstr *>(MI)->getOperand(i).setReg(Newreg);
+    }
+  }
+}
+// Get pointers to build instruction just after MI (skips phis if needed)
+static std::pair<MachineBasicBlock *, MachineBasicBlock::iterator>
+getInsertAfterPtrs(MachineInstr *MI) {
+  MachineBasicBlock *InsertMBB = MI->getParent();
+  return std::make_pair(
+      InsertMBB, InsertMBB->SkipPHIsAndLabels(std::next(MI->getIterator())));
+}
+
+bool AMDGPUTemporalDivergenceLowering::runOnMachineFunction(
+    MachineFunction &MF) {
+
+  MachineCycleInfo &CycleInfo =
+      getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo();
+  MachineDominatorTree &DomTree = getAnalysis<MachineDominatorTree>();
+
+  MachineUniformityInfo MUI =
+      computeMachineUniformityInfo(MF, CycleInfo, DomTree.getBase(), true);
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
+
+  // Temporal divergence lowering is required when a source register is uniform
+  // and its user is divergent. The user is uniform only when the loop is uniform.
+  for (auto [SrcReg, UserInstr, CycleExitBlocks] : MUI.uses_outside_cycle()) {
+    if (!MUI.isUniform(SrcReg) || !MUI.isDivergent(UserInstr))
+      continue;
+
+    MachineInstr *UniformSourceInstr = MRI.getVRegDef(SrcReg);
+
+    // FIXME: SrcReg is a lane mask in this case. Find a better way to detect it.
+ if (UniformSourceInstr->getOpcode() == AMDGPU::SI_IF_BREAK || + UserInstr->getOpcode() == AMDGPU::SI_IF) + continue; + + unsigned Size = TRI.getRegSizeInBits(*MRI.getRegClassOrNull(SrcReg)); + Register VgprDst = + MRI.createVirtualRegister(TRI.getVGPRClassForBitWidth(Size)); + + auto [MBB, AfterUniformSourceReg] = getInsertAfterPtrs(UniformSourceInstr); + BuildMI(*MBB, AfterUniformSourceReg, {}, TII.get(AMDGPU::COPY)) + .addDef(VgprDst) + .addReg(SrcReg) + .addReg(AMDGPU::EXEC, RegState::Implicit); + + replaceUseRegisterWith(UserInstr, SrcReg, VgprDst); + } + + return true; +} diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 0922e8d99deb3a..3f838ae75edaeb 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -96,6 +96,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUTargetMachine.cpp AMDGPUTargetObjectFile.cpp AMDGPUTargetTransformInfo.cpp + AMDGPUTemporalDivergenceLowering.cpp AMDGPUUnifyDivergentExitNodes.cpp AMDGPUUnifyMetadata.cpp R600MachineCFGStructurizer.cpp diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 84f67b3faac3c0..91aac981c2c9e9 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -101,10 +101,12 @@ ; GCN-O0-NEXT: Finalize ISel and expand pseudo-instructions ; GCN-O0-NEXT: Local Stack Slot Allocation ; GCN-O0-NEXT: Register Usage Information Propagation +; GCN-O0-NEXT: Machine Cycle Info Analysis +; GCN-O0-NEXT: MachineDominator Tree Construction +; GCN-O0-NEXT: Temporal divergence lowering ; GCN-O0-NEXT: Eliminate PHI nodes for register allocation ; GCN-O0-NEXT: SI Lower control flow pseudo instructions ; GCN-O0-NEXT: Two-Address instruction pass -; GCN-O0-NEXT: MachineDominator Tree Construction ; GCN-O0-NEXT: Slot index numbering ; GCN-O0-NEXT: Live Interval Analysis ; GCN-O0-NEXT: MachinePostDominator Tree Construction @@ -322,12 +324,13 @@ ; GCN-O1-NEXT: Remove dead machine instructions ; GCN-O1-NEXT: SI Shrink Instructions ; GCN-O1-NEXT: Register Usage Information Propagation +; GCN-O1-NEXT: MachineDominator Tree Construction +; GCN-O1-NEXT: Temporal divergence lowering ; GCN-O1-NEXT: Detect Dead Lanes ; GCN-O1-NEXT: Remove dead machine instructions ; GCN-O1-NEXT: Process Implicit Definitions ; GCN-O1-NEXT: Remove unreachable machine basic blocks ; GCN-O1-NEXT: Live Variable Analysis -; GCN-O1-NEXT: MachineDominator Tree Construction ; GCN-O1-NEXT: SI Optimize VGPR LiveRange ; GCN-O1-NEXT: Eliminate PHI nodes for register allocation ; GCN-O1-NEXT: SI Lower control flow pseudo instructions @@ -616,6 +619,8 @@ ; GCN-O1-OPTS-NEXT: Remove dead machine instructions ; GCN-O1-OPTS-NEXT: SI Shrink Instructions ; GCN-O1-OPTS-NEXT: Register Usage Information Propagation +; GCN-O1-OPTS-NEXT: Machine Cycle Info Analysis +; GCN-O1-OPTS-NEXT: Temporal divergence lowering ; GCN-O1-OPTS-NEXT: Detect Dead Lanes ; GCN-O1-OPTS-NEXT: Remove dead machine instructions ; GCN-O1-OPTS-NEXT: Process Implicit Definitions @@ -919,6 +924,8 @@ ; GCN-O2-NEXT: Remove dead machine instructions ; GCN-O2-NEXT: SI Shrink Instructions ; GCN-O2-NEXT: Register Usage Information Propagation +; GCN-O2-NEXT: Machine Cycle Info Analysis +; GCN-O2-NEXT: Temporal divergence lowering ; GCN-O2-NEXT: Detect Dead Lanes ; GCN-O2-NEXT: Remove dead machine instructions ; GCN-O2-NEXT: Process Implicit Definitions @@ -1235,6 +1242,8 @@ ; GCN-O3-NEXT: Remove dead machine instructions ; GCN-O3-NEXT: SI Shrink Instructions ; GCN-O3-NEXT: 
Register Usage Information Propagation +; GCN-O3-NEXT: Machine Cycle Info Analysis +; GCN-O3-NEXT: Temporal divergence lowering ; GCN-O3-NEXT: Detect Dead Lanes ; GCN-O3-NEXT: Remove dead machine instructions ; GCN-O3-NEXT: Process Implicit Definitions diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll index e2456b74f7ef1f..b8e74bc7db09a1 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll @@ -21,6 +21,7 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49 ; CHECK-NEXT: .LBB0_1: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: v_add_nc_u32_e32 v4, -4, v4 ; CHECK-NEXT: .LBB0_2: ; %Flow1 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 @@ -53,7 +54,6 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_add_nc_u32_e32 v4, s9, v2 ; CHECK-NEXT: v_cmp_ge_u32_e64 s4, v4, v0 -; CHECK-NEXT: v_add_nc_u32_e32 v4, -4, v4 ; CHECK-NEXT: s_or_b32 s8, s4, s8 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: s_cbranch_execz .LBB0_1 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir index cc14b4a80d58a7..037a285794120d 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir @@ -42,7 +42,6 @@ body: | ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[SI_IF1]], [[SI_IF]], implicit-def dead $scc ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.5 @@ -52,6 +51,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.4 ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]] ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll new file mode 100644 index 00000000000000..4415d7d4a7614d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll @@ -0,0 +1,792 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s + +; ModuleID = 'kernel_round1_passing.bc' +source_filename = "/tmp/comgr-295d04/input/CompileSource" +target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8" +target triple = "amdgcn-amd-amdhsa" + +@kernel_round1.first_words_data = external hidden unnamed_addr addrspace(3) global [896 x i8], align 1 +@kernel_round1.collisionsData = external hidden unnamed_addr addrspace(3) global [3840 x i32], align 4 +@kernel_round1.collisionsNum = external hidden addrspace(3) global i32, align 4 + +; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none) +declare hidden i64 @_Z13get_global_idj(i32 noundef) local_unnamed_addr #0 + +; Function Attrs: convergent nounwind +declare hidden i32 @_Z10atomic_addPU3AS1Vjj(ptr addrspace(1) noundef, i32 noundef) local_unnamed_addr #1 + +; Function Attrs: convergent nounwind +declare hidden i32 @_Z10atomic_subPU3AS1Vjj(ptr addrspace(1) noundef, i32 noundef) local_unnamed_addr #1 + +; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none) +declare hidden i64 @_Z12get_local_idj(i32 noundef) local_unnamed_addr #0 + +; Function Attrs: convergent nounwind +declare hidden void @_Z7barrierj(i32 noundef) local_unnamed_addr #1 + +; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none) +declare hidden i32 @_Z3minjj(i32 noundef, i32 noundef) local_unnamed_addr #0 + +; Function Attrs: convergent nounwind +declare hidden i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef) local_unnamed_addr #1 + +; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none) +declare hidden i64 @_Z14get_local_sizej(i32 noundef) local_unnamed_addr #0 + +; Function Attrs: convergent norecurse nounwind +define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture noundef readonly align 1 %0, ptr addrspace(1) nocapture noundef writeonly align 1 %1, ptr addrspace(1) nocapture noundef readonly align 4 %2, ptr addrspace(1) noundef align 4 %3, ptr addrspace(1) nocapture noundef readnone align 4 %4) local_unnamed_addr #2 !kernel_arg_addr_space !5 !kernel_arg_access_qual !6 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !8 !kernel_arg_name !9 !reqd_work_group_size !10 { +; CHECK-LABEL: kernel_round1: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; CHECK-NEXT: s_load_dwordx8 s[44:51], s[6:7], 0x0 +; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: v_mov_b32_e32 v41, v0 +; CHECK-NEXT: s_add_u32 s42, s34, 40 +; CHECK-NEXT: v_mov_b32_e32 v31, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_addc_u32 s43, s35, 0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] +; CHECK-NEXT: s_mov_b32 s33, s14 +; CHECK-NEXT: s_mov_b32 s40, s13 +; CHECK-NEXT: s_mov_b32 s41, s12 +; CHECK-NEXT: s_mov_b64 s[38:39], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v45, 0 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: v_mov_b32_e32 v43, v0 +; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: 
s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] +; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b32 s12, s41 +; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s14, s33 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, _Z12get_local_idj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, _Z12get_local_idj@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: v_mov_b32_e32 v40, v0 +; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] +; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b32 s12, s41 +; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s14, s33 +; CHECK-NEXT: ds_write_b32 v45, v45 offset:15360 +; CHECK-NEXT: s_getpc_b64 s[52:53] +; CHECK-NEXT: s_add_u32 s52, s52, _Z7barrierj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s53, s53, _Z7barrierj@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[52:53] +; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v43 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v43 +; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] +; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffffc, v0 +; CHECK-NEXT: v_and_b32_e32 v1, 28, v1 +; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b32 s12, s41 +; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: global_load_dword v0, v0, s[48:49] +; CHECK-NEXT: s_mov_b32 s14, s33 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, _Z3minjj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, _Z3minjj@rel32@hi+12 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_bfe_u32 v0, v0, v1, 4 +; CHECK-NEXT: v_mov_b32_e32 v1, 12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: v_mov_b32_e32 v42, v0 +; CHECK-NEXT: s_mov_b32 s48, exec_lo +; CHECK-NEXT: v_cmpx_ne_u32_e32 0, v42 +; CHECK-NEXT: s_cbranch_execz .LBB0_25 +; CHECK-NEXT: ; %bb.1: ; %.preheader5 +; CHECK-NEXT: v_mul_lo_u32 v0, v40, 14 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v0 +; CHECK-NEXT: .LBB0_2: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_add_nc_u32_e32 v1, s5, v44 +; CHECK-NEXT: s_add_i32 s5, s5, 1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v42 +; CHECK-NEXT: ds_write_b8 v1, v45 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execnz .LBB0_2 +; CHECK-NEXT: ; %bb.3: +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v42 +; CHECK-NEXT: s_mov_b32 s49, 0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v45 +; CHECK-NEXT: s_and_b32 exec_lo, exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_execz .LBB0_25 +; CHECK-NEXT: ; %bb.4: +; CHECK-NEXT: v_lshlrev_b32_e32 v43, 10, v43 +; CHECK-NEXT: v_add_nc_u32_e32 v46, 0x3c05, v0 +; CHECK-NEXT: v_mov_b32_e32 v47, 0 +; CHECK-NEXT: s_getpc_b64 s[42:43] +; CHECK-NEXT: s_add_u32 s42, s42, _Z10atomic_incPU3AS3Vj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s43, s43, _Z10atomic_incPU3AS3Vj@rel32@hi+12 +; CHECK-NEXT: s_mov_b32 s55, 0 +; CHECK-NEXT: .LBB0_5: ; =>This Loop Header: Depth=1 +; CHECK-NEXT: ; Child Loop BB0_8 Depth 2 +; CHECK-NEXT: ; Child Loop BB0_20 Depth 2 +; CHECK-NEXT: v_add_nc_u32_e32 v0, s55, v44 +; CHECK-NEXT: s_lshl_b32 s4, s55, 5 +; CHECK-NEXT: s_add_i32 s54, s55, 1 +; CHECK-NEXT: s_add_i32 s5, s55, 5 +; CHECK-NEXT: v_or3_b32 v57, s4, v43, s54 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: ds_read_u8 v56, v0 +; 
CHECK-NEXT: v_mov_b32_e32 v58, s54 +; CHECK-NEXT: s_mov_b32 s56, exec_lo +; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v42 +; CHECK-NEXT: s_cbranch_execz .LBB0_17 +; CHECK-NEXT: ; %bb.6: ; %.preheader2 +; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 +; CHECK-NEXT: s_mov_b32 s57, 0 +; CHECK-NEXT: s_mov_b32 s58, 0 +; CHECK-NEXT: s_branch .LBB0_8 +; CHECK-NEXT: .LBB0_7: ; in Loop: Header=BB0_8 Depth=2 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s59 +; CHECK-NEXT: s_add_i32 s58, s58, 4 +; CHECK-NEXT: s_add_i32 s4, s55, s58 +; CHECK-NEXT: v_mov_b32_e32 v0, s58 +; CHECK-NEXT: s_add_i32 s5, s4, 5 +; CHECK-NEXT: s_add_i32 s4, s4, 1 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s5, v42 +; CHECK-NEXT: v_mov_b32_e32 v58, s4 +; CHECK-NEXT: s_or_b32 s57, vcc_lo, s57 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s57 +; CHECK-NEXT: s_cbranch_execz .LBB0_16 +; CHECK-NEXT: .LBB0_8: ; Parent Loop BB0_5 Depth=1 +; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NEXT: v_add_nc_u32_e32 v59, s58, v46 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_and_b32_e32 v60, 0xff, v56 +; CHECK-NEXT: v_add_nc_u32_e32 v58, s58, v57 +; CHECK-NEXT: s_mov_b32 s59, exec_lo +; CHECK-NEXT: ds_read_u8 v0, v59 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_cmpx_eq_u16_e64 v60, v0 +; CHECK-NEXT: s_cbranch_execz .LBB0_10 +; CHECK-NEXT: ; %bb.9: ; in Loop: Header=BB0_8 Depth=2 +; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 +; CHECK-NEXT: s_add_u32 s8, s34, 40 +; CHECK-NEXT: s_addc_u32 s9, s35, 0 +; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b32 s12, s41 +; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s14, s33 +; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43] +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CHECK-NEXT: ds_write_b32 v0, v58 +; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_8 Depth=2 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s59 +; CHECK-NEXT: ds_read_u8 v0, v59 offset:1 +; CHECK-NEXT: s_mov_b32 s59, exec_lo +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_cmpx_eq_u16_e64 v60, v0 +; CHECK-NEXT: s_cbranch_execz .LBB0_12 +; CHECK-NEXT: ; %bb.11: ; in Loop: Header=BB0_8 Depth=2 +; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 +; CHECK-NEXT: s_add_u32 s8, s34, 40 +; CHECK-NEXT: s_addc_u32 s9, s35, 0 +; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b32 s12, s41 +; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s14, s33 +; CHECK-NEXT: v_add_nc_u32_e32 v61, 1, v58 +; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43] +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CHECK-NEXT: ds_write_b32 v0, v61 +; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_8 Depth=2 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s59 +; CHECK-NEXT: ds_read_u8 v0, v59 offset:2 +; CHECK-NEXT: s_mov_b32 s59, exec_lo +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_cmpx_eq_u16_e64 v60, v0 +; CHECK-NEXT: s_cbranch_execz .LBB0_14 +; CHECK-NEXT: ; %bb.13: ; in Loop: Header=BB0_8 Depth=2 +; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 +; CHECK-NEXT: s_add_u32 s8, s34, 40 +; CHECK-NEXT: s_addc_u32 s9, s35, 0 +; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b32 s12, s41 +; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s14, s33 +; CHECK-NEXT: v_add_nc_u32_e32 v61, 2, v58 +; CHECK-NEXT: v_add_nc_u32_e32 
v47, 1, v47 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43] +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CHECK-NEXT: ds_write_b32 v0, v61 +; CHECK-NEXT: .LBB0_14: ; in Loop: Header=BB0_8 Depth=2 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s59 +; CHECK-NEXT: ds_read_u8 v0, v59 offset:3 +; CHECK-NEXT: s_mov_b32 s59, exec_lo +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_cmpx_eq_u16_e64 v60, v0 +; CHECK-NEXT: s_cbranch_execz .LBB0_7 +; CHECK-NEXT: ; %bb.15: ; in Loop: Header=BB0_8 Depth=2 +; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 +; CHECK-NEXT: s_add_u32 s8, s34, 40 +; CHECK-NEXT: s_addc_u32 s9, s35, 0 +; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b32 s12, s41 +; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s14, s33 +; CHECK-NEXT: v_add_nc_u32_e32 v58, 3, v58 +; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43] +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CHECK-NEXT: ds_write_b32 v0, v58 +; CHECK-NEXT: s_branch .LBB0_7 +; CHECK-NEXT: .LBB0_16: ; %Flow43 +; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57 +; CHECK-NEXT: v_add_nc_u32_e32 v57, v0, v57 +; CHECK-NEXT: .LBB0_17: ; %Flow44 +; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s56 +; CHECK-NEXT: s_mov_b32 s55, exec_lo +; CHECK-NEXT: v_cmpx_lt_u32_e64 v58, v42 +; CHECK-NEXT: s_cbranch_execz .LBB0_23 +; CHECK-NEXT: ; %bb.18: ; %.preheader +; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 +; CHECK-NEXT: s_mov_b32 s56, 0 +; CHECK-NEXT: s_inst_prefetch 0x1 +; CHECK-NEXT: s_branch .LBB0_20 +; CHECK-NEXT: .p2align 6 +; CHECK-NEXT: .LBB0_19: ; in Loop: Header=BB0_20 Depth=2 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57 +; CHECK-NEXT: v_add_nc_u32_e32 v58, 1, v58 +; CHECK-NEXT: v_add_nc_u32_e32 v57, 1, v57 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v58, v42 +; CHECK-NEXT: s_or_b32 s56, vcc_lo, s56 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s56 +; CHECK-NEXT: s_cbranch_execz .LBB0_22 +; CHECK-NEXT: .LBB0_20: ; Parent Loop BB0_5 Depth=1 +; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v58 +; CHECK-NEXT: ds_read_u8 v0, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD +; CHECK-NEXT: s_and_saveexec_b32 s57, s4 +; CHECK-NEXT: s_cbranch_execz .LBB0_19 +; CHECK-NEXT: ; %bb.21: ; in Loop: Header=BB0_20 Depth=2 +; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 +; CHECK-NEXT: s_add_u32 s8, s34, 40 +; CHECK-NEXT: s_addc_u32 s9, s35, 0 +; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b32 s12, s41 +; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s14, s33 +; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43] +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CHECK-NEXT: ds_write_b32 v0, v57 +; CHECK-NEXT: s_branch .LBB0_19 +; CHECK-NEXT: .LBB0_22: ; %Flow41 +; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 +; CHECK-NEXT: s_inst_prefetch 0x2 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s56 +; CHECK-NEXT: .LBB0_23: ; %Flow42 +; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 +; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s54, v45 +; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47 +; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46 +; 
CHECK-NEXT: s_mov_b32 s55, s54 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_and_b32 s4, exec_lo, s4 +; CHECK-NEXT: s_or_b32 s49, s4, s49 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s49 +; CHECK-NEXT: s_cbranch_execnz .LBB0_5 +; CHECK-NEXT: .LBB0_25: ; %Flow49 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s48 +; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_add_u32 s8, s34, 40 +; CHECK-NEXT: s_addc_u32 s9, s35, 0 +; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b32 s12, s41 +; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s14, s33 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[52:53] +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_mov_b32 s4, exec_lo +; CHECK-NEXT: ds_read_b32 v47, v0 offset:15360 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_cmpx_gt_u32_e64 v47, v40 +; CHECK-NEXT: s_cbranch_execz .LBB0_33 +; CHECK-NEXT: ; %bb.26: +; CHECK-NEXT: s_add_u32 s52, s44, 8 +; CHECK-NEXT: s_addc_u32 s53, s45, 0 +; CHECK-NEXT: s_getpc_b64 s[42:43] +; CHECK-NEXT: s_add_u32 s42, s42, _Z10atomic_addPU3AS1Vjj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s43, s43, _Z10atomic_addPU3AS1Vjj@rel32@hi+12 +; CHECK-NEXT: s_mov_b32 s54, 0 +; CHECK-NEXT: s_getpc_b64 s[44:45] +; CHECK-NEXT: s_add_u32 s44, s44, _Z10atomic_subPU3AS1Vjj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s45, s45, _Z10atomic_subPU3AS1Vjj@rel32@hi+12 +; CHECK-NEXT: s_getpc_b64 s[48:49] +; CHECK-NEXT: s_add_u32 s48, s48, _Z14get_local_sizej@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s49, s49, _Z14get_local_sizej@rel32@hi+12 +; CHECK-NEXT: s_branch .LBB0_28 +; CHECK-NEXT: .LBB0_27: ; in Loop: Header=BB0_28 Depth=1 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 +; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_add_u32 s8, s34, 40 +; CHECK-NEXT: s_addc_u32 s9, s35, 0 +; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b32 s12, s41 +; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s14, s33 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[48:49] +; CHECK-NEXT: v_add_co_u32 v40, vcc_lo, v0, v40 +; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, v47, v40 +; CHECK-NEXT: s_or_b32 s54, vcc_lo, s54 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s54 +; CHECK-NEXT: s_cbranch_execz .LBB0_33 +; CHECK-NEXT: .LBB0_28: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v40 +; CHECK-NEXT: s_mov_b32 s55, exec_lo +; CHECK-NEXT: ds_read_b32 v0, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_lshrrev_b32_e32 v63, 10, v0 +; CHECK-NEXT: v_bfe_u32 v62, v0, 5, 5 +; CHECK-NEXT: v_and_b32_e32 v72, 31, v0 +; CHECK-NEXT: v_mul_u32_u24_e32 v1, 0x180, v63 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 5, v62 +; CHECK-NEXT: v_lshlrev_b32_e32 v4, 5, v72 +; CHECK-NEXT: v_add_co_u32 v2, s4, s52, v1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s53, 0, s4 +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_xor_b32_e32 v46, v9, v5 +; CHECK-NEXT: v_xor_b32_e32 v45, v8, v4 +; CHECK-NEXT: v_xor_b32_e32 v57, v11, v7 +; CHECK-NEXT: v_xor_b32_e32 v56, v10, v6 +; CHECK-NEXT: v_or_b32_e32 v5, v46, v57 +; CHECK-NEXT: v_or_b32_e32 v4, v45, v56 
+; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[4:5] +; CHECK-NEXT: s_cbranch_execz .LBB0_27 +; CHECK-NEXT: ; %bb.29: ; in Loop: Header=BB0_28 Depth=1 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx2 v[58:59], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx2 v[60:61], v[0:1], off offset:16 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 4, v45 +; CHECK-NEXT: v_alignbit_b32 v1, v46, v45, 12 +; CHECK-NEXT: v_and_b32_e32 v2, 0xf0000, v45 +; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: s_add_u32 s8, s34, 40 +; CHECK-NEXT: v_and_b32_e32 v3, 0xf000, v0 +; CHECK-NEXT: v_and_b32_e32 v4, 0xf00, v1 +; CHECK-NEXT: v_and_b32_e32 v0, 0xf0, v0 +; CHECK-NEXT: v_and_b32_e32 v1, 15, v1 +; CHECK-NEXT: s_addc_u32 s9, s35, 0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: v_or3_b32 v2, v3, v2, v4 +; CHECK-NEXT: s_mov_b32 s12, s41 +; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s14, s33 +; CHECK-NEXT: v_or3_b32 v73, v2, v0, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v73 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v73 +; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffc, v0 +; CHECK-NEXT: v_lshlrev_b32_e64 v44, v1, 1 +; CHECK-NEXT: v_and_b32_e32 v74, 28, v1 +; CHECK-NEXT: v_add_co_u32 v42, s4, s50, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v43, null, s51, 0, s4 +; CHECK-NEXT: v_mov_b32_e32 v2, v44 +; CHECK-NEXT: v_mov_b32_e32 v0, v42 +; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: v_mov_b32_e32 v1, v43 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43] +; CHECK-NEXT: v_bfe_u32 v0, v0, v74, 4 +; CHECK-NEXT: s_mov_b32 s4, exec_lo +; CHECK-NEXT: v_cmpx_gt_u32_e32 12, v0 +; CHECK-NEXT: s_xor_b32 s4, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execz .LBB0_31 +; CHECK-NEXT: ; %bb.30: ; in Loop: Header=BB0_28 Depth=1 +; CHECK-NEXT: v_xor_b32_e32 v5, v60, v58 +; CHECK-NEXT: v_lshrrev_b64 v[3:4], 16, v[56:57] +; CHECK-NEXT: v_mul_u32_u24_e32 v11, 0x180, v73 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; CHECK-NEXT: v_lshrrev_b64 v[1:2], 16, v[45:46] +; CHECK-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; CHECK-NEXT: v_lshlrev_b32_e32 v8, 6, v72 +; CHECK-NEXT: v_lshlrev_b32_e32 v10, 12, v63 +; CHECK-NEXT: v_xor_b32_e32 v6, v61, v59 +; CHECK-NEXT: v_lshlrev_b32_e32 v9, 16, v56 +; CHECK-NEXT: v_or_b32_e32 v4, v7, v4 +; CHECK-NEXT: v_add_co_u32 v7, s5, s46, v11 +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, s47, 0, s5 +; CHECK-NEXT: v_or3_b32 v10, v8, v10, v62 +; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v7, v0 +; CHECK-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v11, vcc_lo +; CHECK-NEXT: v_lshrrev_b64 v[5:6], 16, v[5:6] +; CHECK-NEXT: v_or_b32_e32 v2, v9, v2 +; CHECK-NEXT: global_store_dword v[7:8], v10, off offset:4 +; CHECK-NEXT: global_store_dwordx4 v[7:8], v[1:4], off offset:8 +; CHECK-NEXT: global_store_dwordx2 v[7:8], v[5:6], off offset:24 +; CHECK-NEXT: ; implicit-def: $vgpr42 +; CHECK-NEXT: ; implicit-def: $vgpr43 +; CHECK-NEXT: ; implicit-def: $vgpr44 +; CHECK-NEXT: .LBB0_31: ; %Flow +; CHECK-NEXT: ; in Loop: Header=BB0_28 Depth=1 +; CHECK-NEXT: s_andn2_saveexec_b32 s4, s4 +; CHECK-NEXT: s_cbranch_execz .LBB0_27 +; CHECK-NEXT: ; %bb.32: ; in Loop: Header=BB0_28 Depth=1 +; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: v_mov_b32_e32 v0, v42 +; CHECK-NEXT: v_mov_b32_e32 v1, v43 +; CHECK-NEXT: v_mov_b32_e32 v2, v44 +; CHECK-NEXT: s_add_u32 s8, s34, 40 +; CHECK-NEXT: s_addc_u32 s9, s35, 0 +; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b32 s12, s41 +; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s14, s33 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[44:45] +; 
CHECK-NEXT: s_branch .LBB0_27 +; CHECK-NEXT: .LBB0_33: +; CHECK-NEXT: s_endpgm + %6 = tail call i64 @_Z13get_global_idj(i32 noundef 0) #4 + %7 = trunc i64 %6 to i32 + %8 = tail call i64 @_Z12get_local_idj(i32 noundef 0) #4 + %9 = trunc i64 %8 to i32 + %10 = mul i32 %9, 14 + %11 = getelementptr inbounds i8, ptr addrspace(3) @kernel_round1.first_words_data, i32 %10 + store i32 0, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11 + tail call void @_Z7barrierj(i32 noundef 1) #5 + %12 = lshr i64 %6, 3 + %13 = shl i32 %7, 2 + %14 = and i32 %13, 28 + %15 = and i64 %12, 536870911 + %16 = getelementptr inbounds i32, ptr addrspace(1) %2, i64 %15 + %17 = load i32, ptr addrspace(1) %16, align 4, !tbaa !11 + %18 = lshr i32 %17, %14 + %19 = and i32 %18, 15 + %20 = tail call i32 @_Z3minjj(i32 noundef %19, i32 noundef 12) #4 + %21 = icmp eq i32 %20, 0 + br i1 %21, label %119, label %27 + +22: ; preds = %27 + %23 = add i32 %20, -1 + %24 = icmp eq i32 %23, 0 + br i1 %24, label %119, label %25 + +25: ; preds = %22 + %26 = shl i32 %7, 10 + br label %37 + +27: ; preds = %5, %27 + %28 = phi i32 [ %30, %27 ], [ 0, %5 ] + %29 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %28 + store i8 0, ptr addrspace(3) %29, align 1, !tbaa !15 + %30 = add nuw i32 %28, 1 + %31 = icmp eq i32 %30, %20 + br i1 %31, label %22, label %27 + +32: ; preds = %114, %48 + %33 = phi i32 [ %50, %48 ], [ %115, %114 ] + %34 = icmp ult i32 %44, %23 + %35 = icmp ult i32 %33, 60 + %36 = select i1 %34, i1 %35, i1 false + br i1 %36, label %37, label %119 + +37: ; preds = %32, %25 + %38 = phi i32 [ 0, %25 ], [ %44, %32 ] + %39 = phi i32 [ 0, %25 ], [ %33, %32 ] + %40 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %38 + %41 = load i8, ptr addrspace(3) %40, align 1, !tbaa !15 + %42 = shl i32 %38, 5 + %43 = or i32 %42, %26 + %44 = add nuw i32 %38, 1 + %45 = or i32 %43, %44 + %46 = add i32 %38, 5 + %47 = icmp ult i32 %46, %20 + br i1 %47, label %53, label %48 + +48: ; preds = %98, %37 + %49 = phi i32 [ %45, %37 ], [ %100, %98 ] + %50 = phi i32 [ %39, %37 ], [ %99, %98 ] + %51 = phi i32 [ %44, %37 ], [ %54, %98 ] + %52 = icmp ult i32 %51, %20 + br i1 %52, label %103, label %32 + +53: ; preds = %37, %98 + %54 = phi i32 [ %101, %98 ], [ %46, %37 ] + %55 = phi i32 [ %54, %98 ], [ %44, %37 ] + %56 = phi i32 [ %99, %98 ], [ %39, %37 ] + %57 = phi i32 [ %100, %98 ], [ %45, %37 ] + %58 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %55 + %59 = load i8, ptr addrspace(3) %58, align 1, !tbaa !15 + %60 = icmp eq i8 %41, %59 + br i1 %60, label %61, label %65 + +61: ; preds = %53 + %62 = add i32 %56, 1 + %63 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5 + %64 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %63 + store i32 %57, ptr addrspace(3) %64, align 4, !tbaa !11 + br label %65 + +65: ; preds = %61, %53 + %66 = phi i32 [ %62, %61 ], [ %56, %53 ] + %67 = add i32 %55, 1 + %68 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %67 + %69 = load i8, ptr addrspace(3) %68, align 1, !tbaa !15 + %70 = icmp eq i8 %41, %69 + br i1 %70, label %71, label %76 + +71: ; preds = %65 + %72 = add i32 %57, 1 + %73 = add i32 %66, 1 + %74 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5 + %75 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %74 + store i32 %72, ptr addrspace(3) %75, align 4, !tbaa !11 + br label %76 + +76: ; preds = %71, %65 + %77 = phi i32 [ %73, %71 ], [ 
%66, %65 ] + %78 = add i32 %55, 2 + %79 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %78 + %80 = load i8, ptr addrspace(3) %79, align 1, !tbaa !15 + %81 = icmp eq i8 %41, %80 + br i1 %81, label %82, label %87 + +82: ; preds = %76 + %83 = add i32 %57, 2 + %84 = add i32 %77, 1 + %85 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5 + %86 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %85 + store i32 %83, ptr addrspace(3) %86, align 4, !tbaa !11 + br label %87 + +87: ; preds = %82, %76 + %88 = phi i32 [ %84, %82 ], [ %77, %76 ] + %89 = add i32 %55, 3 + %90 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %89 + %91 = load i8, ptr addrspace(3) %90, align 1, !tbaa !15 + %92 = icmp eq i8 %41, %91 + br i1 %92, label %93, label %98 + +93: ; preds = %87 + %94 = add i32 %57, 3 + %95 = add i32 %88, 1 + %96 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5 + %97 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %96 + store i32 %94, ptr addrspace(3) %97, align 4, !tbaa !11 + br label %98 + +98: ; preds = %93, %87 + %99 = phi i32 [ %95, %93 ], [ %88, %87 ] + %100 = add i32 %57, 4 + %101 = add i32 %54, 4 + %102 = icmp ult i32 %101, %20 + br i1 %102, label %53, label %48 + +103: ; preds = %48, %114 + %104 = phi i32 [ %117, %114 ], [ %51, %48 ] + %105 = phi i32 [ %115, %114 ], [ %50, %48 ] + %106 = phi i32 [ %116, %114 ], [ %49, %48 ] + %107 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %104 + %108 = load i8, ptr addrspace(3) %107, align 1, !tbaa !15 + %109 = icmp eq i8 %41, %108 + br i1 %109, label %110, label %114 + +110: ; preds = %103 + %111 = add i32 %105, 1 + %112 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5 + %113 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %112 + store i32 %106, ptr addrspace(3) %113, align 4, !tbaa !11 + br label %114 + +114: ; preds = %110, %103 + %115 = phi i32 [ %111, %110 ], [ %105, %103 ] + %116 = add i32 %106, 1 + %117 = add nuw i32 %104, 1 + %118 = icmp ult i32 %117, %20 + br i1 %118, label %103, label %32 + +119: ; preds = %32, %22, %5 + tail call void @_Z7barrierj(i32 noundef 1) #5 + %120 = load i32, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11 + %121 = icmp ugt i32 %120, %9 + br i1 %121, label %122, label %206 + +122: ; preds = %119 + %123 = getelementptr inbounds i8, ptr addrspace(1) %0, i64 8 + br label %124 + +124: ; preds = %201, %122 + %125 = phi i32 [ %9, %122 ], [ %204, %201 ] + %126 = phi i64 [ %8, %122 ], [ %203, %201 ] + %127 = and i64 %126, 4294967295 + %128 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %125 + %129 = load i32, ptr addrspace(3) %128, align 4, !tbaa !11 + %130 = lshr i32 %129, 10 + %131 = lshr i32 %129, 5 + %132 = and i32 %131, 31 + %133 = and i32 %129, 31 + %134 = mul nuw nsw i32 %130, 384 + %135 = zext i32 %134 to i64 + %136 = getelementptr inbounds i8, ptr addrspace(1) %123, i64 %135 + %137 = shl nuw nsw i32 %132, 5 + %138 = zext i32 %137 to i64 + %139 = getelementptr inbounds i8, ptr addrspace(1) %136, i64 %138 + %140 = shl nuw nsw i32 %133, 5 + %141 = zext i32 %140 to i64 + %142 = getelementptr inbounds i8, ptr addrspace(1) %136, i64 %141 + %143 = getelementptr inbounds i64, ptr addrspace(1) %139, i64 1 + %144 = load i64, ptr addrspace(1) %139, align 8, !tbaa !16 + %145 = getelementptr inbounds i64, ptr 
addrspace(1) %142, i64 1 + %146 = load i64, ptr addrspace(1) %142, align 8, !tbaa !16 + %147 = xor i64 %146, %144 + %148 = load i64, ptr addrspace(1) %143, align 8, !tbaa !16 + %149 = load i64, ptr addrspace(1) %145, align 8, !tbaa !16 + %150 = xor i64 %149, %148 + %151 = icmp ne i64 %147, 0 + %152 = icmp ne i64 %150, 0 + %153 = select i1 %151, i1 true, i1 %152 + br i1 %153, label %154, label %201 + +154: ; preds = %124 + %155 = getelementptr inbounds i64, ptr addrspace(1) %142, i64 2 + %156 = load i64, ptr addrspace(1) %155, align 8, !tbaa !16 + %157 = getelementptr inbounds i64, ptr addrspace(1) %139, i64 2 + %158 = load i64, ptr addrspace(1) %157, align 8, !tbaa !16 + %159 = and i64 %147, 983040 + %160 = shl i64 %147, 4 + %161 = and i64 %160, 61440 + %162 = or i64 %161, %159 + %163 = lshr i64 %147, 12 + %164 = and i64 %163, 3840 + %165 = or i64 %162, %164 + %166 = and i64 %160, 240 + %167 = or i64 %165, %166 + %168 = and i64 %163, 15 + %169 = or i64 %167, %168 + %170 = trunc i64 %169 to i32 + %171 = lshr i64 %169, 3 + %172 = shl nuw nsw i32 %170, 2 + %173 = and i32 %172, 28 + %174 = getelementptr inbounds i32, ptr addrspace(1) %3, i64 %171 + %175 = shl nuw nsw i32 1, %173 + %176 = tail call i32 @_Z10atomic_addPU3AS1Vjj(ptr addrspace(1) noundef %174, i32 noundef %175) #5 + %177 = lshr i32 %176, %173 + %178 = and i32 %177, 15 + %179 = icmp ugt i32 %178, 11 + br i1 %179, label %180, label %182 + +180: ; preds = %154 + %181 = tail call i32 @_Z10atomic_subPU3AS1Vjj(ptr addrspace(1) noundef %174, i32 noundef %175) #5 + br label %201 + +182: ; preds = %154 + %183 = xor i64 %158, %156 + %184 = lshr i64 %183, 16 + %185 = tail call i64 @llvm.fshl.i64(i64 %183, i64 %150, i64 48) + %186 = tail call i64 @llvm.fshl.i64(i64 %150, i64 %147, i64 48) + %187 = shl nuw nsw i32 %133, 6 + %188 = shl i32 %130, 12 + %189 = or i32 %187, %188 + %190 = or i32 %189, %132 + %191 = mul nuw nsw i64 %169, 384 + %192 = and i64 %191, 4294967168 + %193 = getelementptr inbounds i8, ptr addrspace(1) %1, i64 %192 + %194 = shl nuw nsw i32 %178, 5 + %195 = or i32 %194, 8 + %196 = zext i32 %195 to i64 + %197 = getelementptr inbounds i8, ptr addrspace(1) %193, i64 %196 + %198 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 -4 + store i32 %190, ptr addrspace(1) %198, align 4, !tbaa !11 + store i64 %186, ptr addrspace(1) %197, align 8, !tbaa !16 + %199 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 8 + store i64 %185, ptr addrspace(1) %199, align 8, !tbaa !16 + %200 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 16 + store i64 %184, ptr addrspace(1) %200, align 8, !tbaa !16 + br label %201 + +201: ; preds = %182, %180, %124 + %202 = tail call i64 @_Z14get_local_sizej(i32 noundef 0) #4 + %203 = add i64 %202, %127 + %204 = trunc i64 %203 to i32 + %205 = icmp ugt i32 %120, %204 + br i1 %205, label %124, label %206 + +206: ; preds = %201, %119 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.fshl.i64(i64, i64, i64) #3 + +attributes #0 = { convergent mustprogress nofree nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" } +attributes #1 = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" } +attributes #2 = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="64,64" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" "uniform-work-group-size"="true" } +attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #4 = { convergent nounwind willreturn memory(none) } +attributes #5 = { convergent nounwind } + +!llvm.module.flags = !{!0, !1, !2} +!opencl.ocl.version = !{!3} +!llvm.ident = !{!4} + +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 8, !"PIC Level", i32 2} +!3 = !{i32 1, i32 2} +!4 = !{!"clang version 17.0.0 (ssh://chfang@git.amd.com:29418/lightning/ec/llvm-project 06ead8cf696777b9f17876b60707ba9de4d0606f)"} +!5 = !{i32 1, i32 1, i32 1, i32 1, i32 1} +!6 = !{!"none", !"none", !"none", !"none", !"none"} +!7 = !{!"char*", !"char*", !"uint*", !"uint*", !"uint*"} +!8 = !{!"", !"", !"", !"", !""} +!9 = !{!"ht_src", !"ht_dst", !"rowCountersSrc", !"rowCountersDst", !"debug"} +!10 = !{i32 64, i32 1, i32 1} +!11 = !{!12, !12, i64 0} +!12 = !{!"int", !13, i64 0} +!13 = !{!"omnipotent char", !14, i64 0} +!14 = !{!"Simple C/C++ TBAA"} +!15 = !{!13, !13, i64 0} +!16 = !{!17, !17, i64 0} +!17 = !{!"long", !13, i64 0} diff --git a/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir b/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir index ee3d7aeb454f96..4feef2149b4224 100644 --- a/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir +++ b/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir @@ -17,7 +17,6 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 8 - ; GFX10-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[S_MOV_B32_]], [[DEF]], implicit $exec ; GFX10-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[DEF]], 8, 5, implicit $exec ; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5 ; GFX10-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[V_BFE_U32_e64_]], killed [[S_MOV_B32_1]], implicit $exec @@ -38,6 +37,7 @@ body: | ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_1]], implicit-def $scc + ; GFX10-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[S_MOV_B32_]], [[DEF]], implicit $exec ; GFX10-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 31 ; GFX10-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[V_BFE_U32_e64_]], killed [[S_MOV_B32_2]], implicit $exec ; GFX10-NEXT: [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_1]], -1, implicit-def $scc