From 30a3adf50e2d49dfc97c1b614d9b93638eba672d Mon Sep 17 00:00:00 2001 From: Pravin Jagtap Date: Wed, 30 Aug 2023 12:05:20 -0400 Subject: [PATCH] [AMDGPU] Reorder atomic optimizer to avoid CAS loop. Expand-Atomic pass emits the CAS loop for FP operations which limits the optimizations offered by atomic optimizer. Moving atomic optimizer before expand-atomics allows better codegen. Reviewed By: arsenm, #amdgpu Differential Revision: https://reviews.llvm.org/D157265 Change-Id: I68744786339644060bca4199c041f2020d9b9425 --- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 12 +- .../global-atomic-fadd.f32-no-rtn.ll | 25 +- .../GlobalISel/global-atomic-fadd.f32-rtn.ll | 6 +- .../AMDGPU/atomic-optimizer-strict-wqm.ll | 7 +- .../AMDGPU/global-atomic-fadd.f32-no-rtn.ll | 25 +- .../AMDGPU/global-atomic-fadd.f32-rtn.ll | 6 +- llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll | 1053 ++-- .../AMDGPU/global_atomics_scan_fadd.ll | 3995 ++++++++++---- .../AMDGPU/global_atomics_scan_fsub.ll | 4749 ++++++++++++----- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 28 +- llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll | 534 +- 11 files changed, 7383 insertions(+), 3057 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 0f3fcb6535c283..44193c4945b236 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1027,6 +1027,13 @@ void AMDGPUPassConfig::addIRPasses() { if (TM.getOptLevel() > CodeGenOpt::None) addPass(createInferAddressSpacesPass()); + // Run atomic optimizer before Atomic Expand + if ((TM.getTargetTriple().getArch() == Triple::amdgcn) && + (TM.getOptLevel() >= CodeGenOpt::Less) && + (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) { + addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy)); + } + addPass(createAtomicExpandPass()); if (TM.getOptLevel() > CodeGenOpt::None) { @@ -1153,11 +1160,6 @@ bool GCNPassConfig::addPreISel() { if (TM->getOptLevel() > CodeGenOpt::None) addPass(createAMDGPULateCodeGenPreparePass()); - if ((TM->getOptLevel() >= CodeGenOpt::Less) && - (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) { - addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy)); - } - if (TM->getOptLevel() > CodeGenOpt::None) addPass(createSinkingPass()); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll index 5bb6bac84b842a..c83edff14a9d42 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll @@ -141,7 +141,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1) define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, float %data) #0 { ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GFX90A_GFX940-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: [[PRED_COPY:%[0-9]+]]:sreg_32 = PRED_COPY $sgpr0 @@ -149,11 +149,11 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[PRED_COPY]], %subreg.sub0, [[PRED_COPY1]], %subreg.sub1 ; GFX90A_GFX940-NEXT: [[PRED_COPY2:%[0-9]+]]:vgpr_32 = PRED_COPY $vgpr0 ; GFX90A_GFX940-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE - ; GFX90A_GFX940-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_GFX940-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A_GFX940-NEXT: S_BRANCH %bb.2 ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: bb.2 (%ir-block.5): - ; GFX90A_GFX940-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) + ; GFX90A_GFX940-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: [[PRED_COPY3:%[0-9]+]]:sreg_64 = PRED_COPY $exec ; GFX90A_GFX940-NEXT: [[PRED_COPY4:%[0-9]+]]:sreg_32 = PRED_COPY [[PRED_COPY3]].sub0 @@ -196,29 +196,22 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX90A_GFX940-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[PRED_COPY18]], implicit $exec ; GFX90A_GFX940-NEXT: [[PRED_COPY19:%[0-9]+]]:vgpr_32 = PRED_COPY [[S_MOV_B32_1]] ; GFX90A_GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[PRED_COPY19]], implicit $exec - ; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A_GFX940-NEXT: S_BRANCH %bb.3 ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: bb.3 (%ir-block.36): - ; GFX90A_GFX940-NEXT: successors: %bb.5(0x80000000) + ; GFX90A_GFX940-NEXT: bb.3 (%ir-block.35): + ; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000) ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_BRANCH %bb.5 ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: bb.4.Flow: - ; GFX90A_GFX940-NEXT: successors: %bb.6(0x80000000) - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A_GFX940-NEXT: S_BRANCH %bb.6 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.38): - ; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_GFX940-NEXT: successors: %bb.5(0x80000000) ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A_GFX940-NEXT: S_BRANCH %bb.4 ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: bb.6 (%ir-block.39): + ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.37): + ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A_GFX940-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll index 327c7c14da101f..9b8ac49673ff50 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll @@ -206,7 +206,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.3 ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: bb.3 (%ir-block.39): + ; GFX11-NEXT: bb.3 (%ir-block.36): ; GFX11-NEXT: successors: %bb.5(0x80000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -220,7 +220,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.6 ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: bb.5 (%ir-block.42): + ; GFX11-NEXT: bb.5 (%ir-block.39): ; GFX11-NEXT: successors: %bb.4(0x80000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.3, [[DEF]], %bb.2 @@ -231,7 +231,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[PRED_COPY15]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.4 ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: bb.6 (%ir-block.50): + ; GFX11-NEXT: bb.6 (%ir-block.47): ; GFX11-NEXT: $vgpr0 = PRED_COPY [[PHI]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic diff --git a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll index 611f60562a8156..8d4d04f9f3cca0 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll @@ -14,16 +14,15 @@ define amdgpu_ps void @main(i32 %arg) { ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: s_branch .LBB0_2 -; GFX10-NEXT: .LBB0_1: ; %Flow -; GFX10-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; GFX10-NEXT: .LBB0_1: ; in Loop: Header=BB0_2 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10-NEXT: s_and_b32 s0, exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s2, s0, s2 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_execz .LBB0_5 ; GFX10-NEXT: .LBB0_2: ; %bb4 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_and_b32 s0, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s2, s0, s2 ; GFX10-NEXT: s_and_saveexec_b32 s3, s1 ; GFX10-NEXT: s_cbranch_execz .LBB0_1 ; GFX10-NEXT: ; %bb.3: ; in Loop: Header=BB0_2 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll index 79da505671bb7f..d954ac00c4c2dc 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll @@ -147,7 +147,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1) define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, float %data) #0 { ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) + ; GFX90A_GFX940-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: [[PRED_COPY:%[0-9]+]]:vgpr_32 = PRED_COPY $vgpr0 @@ -156,11 +156,11 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[PRED_COPY2]], %subreg.sub0, [[PRED_COPY1]], %subreg.sub1 ; GFX90A_GFX940-NEXT: [[PRED_COPY3:%[0-9]+]]:sreg_64 = PRED_COPY [[REG_SEQUENCE]] ; GFX90A_GFX940-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE - ; GFX90A_GFX940-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_GFX940-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A_GFX940-NEXT: S_BRANCH %bb.1 ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: bb.1 (%ir-block.5): - ; GFX90A_GFX940-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GFX90A_GFX940-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: [[PRED_COPY4:%[0-9]+]]:sreg_64 = PRED_COPY $exec ; GFX90A_GFX940-NEXT: [[PRED_COPY5:%[0-9]+]]:sreg_32 = PRED_COPY [[PRED_COPY4]].sub1 @@ -188,30 +188,23 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_2]] ; GFX90A_GFX940-NEXT: early-clobber %1:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec ; GFX90A_GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec - ; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A_GFX940-NEXT: S_BRANCH %bb.2 ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: bb.2 (%ir-block.36): - ; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000) + ; GFX90A_GFX940-NEXT: bb.2 (%ir-block.35): + ; GFX90A_GFX940-NEXT: successors: %bb.3(0x80000000) ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX90A_GFX940-NEXT: [[PRED_COPY8:%[0-9]+]]:vgpr_32 = PRED_COPY %1 ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_1]], [[PRED_COPY8]], [[PRED_COPY3]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_BRANCH %bb.4 ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: bb.3.Flow: - ; GFX90A_GFX940-NEXT: successors: %bb.5(0x80000000) - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX90A_GFX940-NEXT: S_BRANCH %bb.5 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: bb.4 (%ir-block.38): - ; GFX90A_GFX940-NEXT: successors: %bb.3(0x80000000) + ; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000) ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX90A_GFX940-NEXT: S_BRANCH %bb.3 ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.39): + ; GFX90A_GFX940-NEXT: bb.4 (%ir-block.37): + ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A_GFX940-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic ret void diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll index 2b63ec052d758d..6eeb44de952667 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll @@ -203,7 +203,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.2 ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: bb.2 (%ir-block.39): + ; GFX11-NEXT: bb.2 (%ir-block.36): ; GFX11-NEXT: successors: %bb.4(0x80000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -218,7 +218,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.5 ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: bb.4 (%ir-block.42): + ; GFX11-NEXT: bb.4 (%ir-block.39): ; GFX11-NEXT: successors: %bb.3(0x80000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 @@ -228,7 +228,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_READFIRSTLANE_B32_]], 0, killed %44, 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.3 ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: bb.5 (%ir-block.50): + ; GFX11-NEXT: bb.5 (%ir-block.47): ; GFX11-NEXT: $vgpr0 = PRED_COPY [[PHI]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll index debcc67b29a5d5..0499f83de1c92a 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -8,138 +8,216 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0 { ; GFX900-LABEL: global_atomic_fadd_ret_f32: ; GFX900: ; %bb.0: +; GFX900-NEXT: s_mov_b64 s[4:5], exec +; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: ; implicit-def: $vgpr1 +; GFX900-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX900-NEXT: s_cbranch_execz .LBB0_4 +; GFX900-NEXT: ; %bb.1: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX900-NEXT: s_mov_b64 s[2:3], 0 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_bcnt1_i32_b64 s7, s[4:5] +; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v1, s7 +; GFX900-NEXT: s_mov_b64 s[4:5], 0 +; GFX900-NEXT: v_mul_f32_e32 v2, 4.0, v1 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX900-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, s4 -; GFX900-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX900-NEXT: v_mov_b32_e32 v1, s6 +; GFX900-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX900-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_wbinvl1_vol -; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX900-NEXT: s_cbranch_execnz .LBB0_1 -; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_cbranch_execnz .LBB0_2 +; GFX900-NEXT: ; %bb.3: ; %Flow +; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX900-NEXT: .LBB0_4: ; %Flow1 ; GFX900-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX900-NEXT: global_store_dword v[0:1], v1, off +; GFX900-NEXT: v_readfirstlane_b32 s0, v1 +; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX900-NEXT: v_mad_f32 v0, v0, 4.0, s0 +; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_endpgm ; ; GFX908-LABEL: global_atomic_fadd_ret_f32: ; GFX908: ; %bb.0: +; GFX908-NEXT: s_mov_b64 s[4:5], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX908-NEXT: ; implicit-def: $vgpr1 +; GFX908-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX908-NEXT: s_cbranch_execz .LBB0_4 +; GFX908-NEXT: ; %bb.1: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX908-NEXT: s_mov_b64 s[2:3], 0 -; GFX908-NEXT: v_mov_b32_e32 v0, 0 +; GFX908-NEXT: s_bcnt1_i32_b64 s7, s[4:5] +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s7 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mul_f32_e32 v2, 4.0, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX908-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX908-NEXT: v_mov_b32_e32 v3, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s4 -; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX908-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1_vol -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX908-NEXT: s_cbranch_execnz .LBB0_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB0_2 +; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: .LBB0_4: ; %Flow1 ; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX908-NEXT: global_store_dword v[0:1], v1, off +; GFX908-NEXT: v_readfirstlane_b32 s0, v1 +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX908-NEXT: v_mad_f32 v0, v0, 4.0, s0 +; GFX908-NEXT: global_store_dword v[0:1], v0, off ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: global_atomic_fadd_ret_f32: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_mov_b64 s[4:5], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: ; implicit-def: $vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB0_4 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_bcnt1_i32_b64 s7, s[4:5] +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s7 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mul_f32_e32 v2, 4.0, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_add_f32_e32 v2, 4.0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc +; GFX90A-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB0_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: .LBB0_4: ; %Flow1 ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX90A-NEXT: global_store_dword v[0:1], v1, off +; GFX90A-NEXT: v_readfirstlane_b32 s0, v1 +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX90A-NEXT: v_mad_f32 v0, v0, 4.0, s0 +; GFX90A-NEXT: global_store_dword v[0:1], v0, off ; GFX90A-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_fadd_ret_f32: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s4, exec_lo +; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB0_4 +; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 +; GFX10-NEXT: v_mul_f32_e32 v2, 4.0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX10-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 +; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-NEXT: s_cbranch_execnz .LBB0_2 +; GFX10-NEXT: ; %bb.3: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-NEXT: .LBB0_4: ; %Flow1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: global_store_dword v[0:1], v1, off +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX10-NEXT: v_mad_f32 v0, v0, 4.0, s0 +; GFX10-NEXT: global_store_dword v[0:1], v0, off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_fadd_ret_f32: ; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s4, exec_lo +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB0_4 +; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX11-NEXT: v_dual_mul_f32 v2, 4.0, v1 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v1 +; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v3, v[4:5], s[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_execnz .LBB0_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 +; GFX11-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_execnz .LBB0_2 +; GFX11-NEXT: ; %bb.3: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11-NEXT: .LBB0_4: ; %Flow1 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: global_store_b32 v[0:1], v1, off +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; GFX11-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -151,54 +229,88 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0 define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr) #2 { ; GFX900-LABEL: global_atomic_fadd_ret_f32_ieee: ; GFX900: ; %bb.0: +; GFX900-NEXT: s_mov_b64 s[4:5], exec +; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: ; implicit-def: $vgpr1 +; GFX900-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX900-NEXT: s_cbranch_execz .LBB1_4 +; GFX900-NEXT: ; %bb.1: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX900-NEXT: s_mov_b64 s[2:3], 0 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_bcnt1_i32_b64 s7, s[4:5] +; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v1, s7 +; GFX900-NEXT: s_mov_b64 s[4:5], 0 +; GFX900-NEXT: v_mul_f32_e32 v2, 4.0, v1 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX900-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, s4 -; GFX900-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX900-NEXT: v_mov_b32_e32 v1, s6 +; GFX900-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX900-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_wbinvl1_vol -; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX900-NEXT: s_cbranch_execnz .LBB1_1 -; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_cbranch_execnz .LBB1_2 +; GFX900-NEXT: ; %bb.3: ; %Flow +; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX900-NEXT: .LBB1_4: ; %Flow1 ; GFX900-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX900-NEXT: global_store_dword v[0:1], v1, off +; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX900-NEXT: v_readfirstlane_b32 s0, v1 +; GFX900-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; GFX900-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_endpgm ; ; GFX908-LABEL: global_atomic_fadd_ret_f32_ieee: ; GFX908: ; %bb.0: +; GFX908-NEXT: s_mov_b64 s[4:5], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX908-NEXT: ; implicit-def: $vgpr1 +; GFX908-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX908-NEXT: s_cbranch_execz .LBB1_4 +; GFX908-NEXT: ; %bb.1: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX908-NEXT: s_mov_b64 s[2:3], 0 -; GFX908-NEXT: v_mov_b32_e32 v0, 0 +; GFX908-NEXT: s_bcnt1_i32_b64 s7, s[4:5] +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s7 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mul_f32_e32 v2, 4.0, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX908-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX908-NEXT: v_mov_b32_e32 v3, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s4 -; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX908-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1_vol -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX908-NEXT: s_cbranch_execnz .LBB1_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB1_2 +; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: .LBB1_4: ; %Flow1 ; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX908-NEXT: global_store_dword v[0:1], v1, off +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX908-NEXT: v_readfirstlane_b32 s0, v1 +; GFX908-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; GFX908-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX908-NEXT: global_store_dword v[0:1], v0, off ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: global_atomic_fadd_ret_f32_ieee: @@ -231,30 +343,46 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr ; ; GFX10-LABEL: global_atomic_fadd_ret_f32_ieee: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s4, exec_lo +; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB1_4 +; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 +; GFX10-NEXT: v_mul_f32_e32 v2, 4.0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX10-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_cbranch_execnz .LBB1_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 +; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-NEXT: s_cbranch_execnz .LBB1_2 +; GFX10-NEXT: ; %bb.3: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-NEXT: .LBB1_4: ; %Flow1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: global_store_dword v[0:1], v1, off +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; GFX10-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX10-NEXT: global_store_dword v[0:1], v0, off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_fadd_ret_f32_ieee: @@ -294,26 +422,36 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) #0 { ; GFX900-LABEL: global_atomic_fadd_noret_f32: ; GFX900: ; %bb.0: +; GFX900-NEXT: s_mov_b64 s[2:3], exec +; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX900-NEXT: s_cbranch_execz .LBB2_3 +; GFX900-NEXT: ; %bb.1: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX900-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX900-NEXT: s_mov_b64 s[2:3], 0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v1, s4 -; GFX900-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX900-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX900-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX900-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_wbinvl1_vol ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX900-NEXT: v_mov_b32_e32 v1, v0 ; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX900-NEXT: s_cbranch_execnz .LBB2_1 -; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_cbranch_execnz .LBB2_2 +; GFX900-NEXT: .LBB2_3: ; GFX900-NEXT: s_endpgm ; ; GFX908-LABEL: global_atomic_fadd_noret_f32: @@ -360,19 +498,28 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) # ; ; GFX10-LABEL: global_atomic_fadd_noret_f32: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s3, exec_lo +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB2_3 +; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX10-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX10-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -380,8 +527,8 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) # ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_cbranch_execnz .LBB2_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_cbranch_execnz .LBB2_2 +; GFX10-NEXT: .LBB2_3: ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_fadd_noret_f32: @@ -411,26 +558,36 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) # define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %ptr) #2 { ; GFX900-LABEL: global_atomic_fadd_noret_f32_ieee: ; GFX900: ; %bb.0: +; GFX900-NEXT: s_mov_b64 s[2:3], exec +; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX900-NEXT: s_cbranch_execz .LBB3_3 +; GFX900-NEXT: ; %bb.1: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX900-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX900-NEXT: s_mov_b64 s[2:3], 0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v1, s4 -; GFX900-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX900-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX900-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX900-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_wbinvl1_vol ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX900-NEXT: v_mov_b32_e32 v1, v0 ; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX900-NEXT: s_cbranch_execnz .LBB3_1 -; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_cbranch_execnz .LBB3_2 +; GFX900-NEXT: .LBB3_3: ; GFX900-NEXT: s_endpgm ; ; GFX908-LABEL: global_atomic_fadd_noret_f32_ieee: @@ -477,19 +634,28 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %p ; ; GFX10-LABEL: global_atomic_fadd_noret_f32_ieee: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s3, exec_lo +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB3_3 +; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX10-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX10-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -497,8 +663,8 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %p ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_cbranch_execnz .LBB3_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_cbranch_execnz .LBB3_2 +; GFX10-NEXT: .LBB3_3: ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_fadd_noret_f32_ieee: @@ -528,54 +694,86 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %p define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %ptr) #0 { ; GFX900-LABEL: global_atomic_fadd_ret_f32_agent: ; GFX900: ; %bb.0: +; GFX900-NEXT: s_mov_b64 s[4:5], exec +; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: ; implicit-def: $vgpr1 +; GFX900-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX900-NEXT: s_cbranch_execz .LBB4_4 +; GFX900-NEXT: ; %bb.1: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX900-NEXT: s_mov_b64 s[2:3], 0 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_bcnt1_i32_b64 s7, s[4:5] +; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v1, s7 +; GFX900-NEXT: s_mov_b64 s[4:5], 0 +; GFX900-NEXT: v_mul_f32_e32 v2, 4.0, v1 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX900-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, s4 -; GFX900-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX900-NEXT: v_mov_b32_e32 v1, s6 +; GFX900-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX900-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_wbinvl1_vol -; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX900-NEXT: s_cbranch_execnz .LBB4_1 -; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_cbranch_execnz .LBB4_2 +; GFX900-NEXT: ; %bb.3: ; %Flow +; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX900-NEXT: .LBB4_4: ; %Flow1 ; GFX900-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX900-NEXT: global_store_dword v[0:1], v1, off +; GFX900-NEXT: v_readfirstlane_b32 s0, v1 +; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX900-NEXT: v_mad_f32 v0, v0, 4.0, s0 +; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_endpgm ; ; GFX908-LABEL: global_atomic_fadd_ret_f32_agent: ; GFX908: ; %bb.0: +; GFX908-NEXT: s_mov_b64 s[4:5], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX908-NEXT: ; implicit-def: $vgpr1 +; GFX908-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX908-NEXT: s_cbranch_execz .LBB4_4 +; GFX908-NEXT: ; %bb.1: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX908-NEXT: s_mov_b64 s[2:3], 0 -; GFX908-NEXT: v_mov_b32_e32 v0, 0 +; GFX908-NEXT: s_bcnt1_i32_b64 s7, s[4:5] +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s7 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mul_f32_e32 v2, 4.0, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX908-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX908-NEXT: v_mov_b32_e32 v3, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s4 -; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX908-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1_vol -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX908-NEXT: s_cbranch_execnz .LBB4_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB4_2 +; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: .LBB4_4: ; %Flow1 ; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX908-NEXT: global_store_dword v[0:1], v1, off +; GFX908-NEXT: v_readfirstlane_b32 s0, v1 +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX908-NEXT: v_mad_f32 v0, v0, 4.0, s0 +; GFX908-NEXT: global_store_dword v[0:1], v0, off ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: global_atomic_fadd_ret_f32_agent: @@ -607,30 +805,45 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %pt ; ; GFX10-LABEL: global_atomic_fadd_ret_f32_agent: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s4, exec_lo +; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB4_4 +; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 +; GFX10-NEXT: v_mul_f32_e32 v2, 4.0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX10-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_cbranch_execnz .LBB4_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 +; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-NEXT: s_cbranch_execnz .LBB4_2 +; GFX10-NEXT: ; %bb.3: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-NEXT: .LBB4_4: ; %Flow1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: global_store_dword v[0:1], v1, off +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX10-NEXT: v_mad_f32 v0, v0, 4.0, s0 +; GFX10-NEXT: global_store_dword v[0:1], v0, off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_fadd_ret_f32_agent: @@ -670,138 +883,216 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %pt define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %ptr) #0 { ; GFX900-LABEL: global_atomic_fadd_ret_f32_system: ; GFX900: ; %bb.0: +; GFX900-NEXT: s_mov_b64 s[4:5], exec +; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: ; implicit-def: $vgpr1 +; GFX900-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX900-NEXT: s_cbranch_execz .LBB5_4 +; GFX900-NEXT: ; %bb.1: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX900-NEXT: s_mov_b64 s[2:3], 0 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_bcnt1_i32_b64 s7, s[4:5] +; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v1, s7 +; GFX900-NEXT: s_mov_b64 s[4:5], 0 +; GFX900-NEXT: v_mul_f32_e32 v2, 4.0, v1 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX900-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, s4 -; GFX900-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX900-NEXT: v_mov_b32_e32 v1, s6 +; GFX900-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX900-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_wbinvl1_vol -; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX900-NEXT: s_cbranch_execnz .LBB5_1 -; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_cbranch_execnz .LBB5_2 +; GFX900-NEXT: ; %bb.3: ; %Flow +; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX900-NEXT: .LBB5_4: ; %Flow1 ; GFX900-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX900-NEXT: global_store_dword v[0:1], v1, off +; GFX900-NEXT: v_readfirstlane_b32 s0, v1 +; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX900-NEXT: v_mad_f32 v0, v0, 4.0, s0 +; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_endpgm ; ; GFX908-LABEL: global_atomic_fadd_ret_f32_system: ; GFX908: ; %bb.0: +; GFX908-NEXT: s_mov_b64 s[4:5], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX908-NEXT: ; implicit-def: $vgpr1 +; GFX908-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX908-NEXT: s_cbranch_execz .LBB5_4 +; GFX908-NEXT: ; %bb.1: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX908-NEXT: s_mov_b64 s[2:3], 0 -; GFX908-NEXT: v_mov_b32_e32 v0, 0 +; GFX908-NEXT: s_bcnt1_i32_b64 s7, s[4:5] +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s7 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mul_f32_e32 v2, 4.0, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX908-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX908-NEXT: v_mov_b32_e32 v3, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s4 -; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX908-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1_vol -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB5_2 +; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: .LBB5_4: ; %Flow1 ; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX908-NEXT: global_store_dword v[0:1], v1, off +; GFX908-NEXT: v_readfirstlane_b32 s0, v1 +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX908-NEXT: v_mad_f32 v0, v0, 4.0, s0 +; GFX908-NEXT: global_store_dword v[0:1], v0, off ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: global_atomic_fadd_ret_f32_system: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_mov_b64 s[4:5], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: ; implicit-def: $vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB5_4 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_bcnt1_i32_b64 s7, s[4:5] +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s7 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mul_f32_e32 v2, 4.0, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_add_f32_e32 v2, 4.0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc +; GFX90A-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB5_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: .LBB5_4: ; %Flow1 ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX90A-NEXT: global_store_dword v[0:1], v1, off +; GFX90A-NEXT: v_readfirstlane_b32 s0, v1 +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX90A-NEXT: v_mad_f32 v0, v0, 4.0, s0 +; GFX90A-NEXT: global_store_dword v[0:1], v0, off ; GFX90A-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_fadd_ret_f32_system: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s4, exec_lo +; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB5_4 +; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 +; GFX10-NEXT: v_mul_f32_e32 v2, 4.0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX10-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_cbranch_execnz .LBB5_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 +; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-NEXT: s_cbranch_execnz .LBB5_2 +; GFX10-NEXT: ; %bb.3: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-NEXT: .LBB5_4: ; %Flow1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: global_store_dword v[0:1], v1, off +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX10-NEXT: v_mad_f32 v0, v0, 4.0, s0 +; GFX10-NEXT: global_store_dword v[0:1], v0, off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_fadd_ret_f32_system: ; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s4, exec_lo +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX11-NEXT: v_dual_mul_f32 v2, 4.0, v1 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v1 +; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v3, v[4:5], s[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 +; GFX11-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_execnz .LBB5_2 +; GFX11-NEXT: ; %bb.3: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11-NEXT: .LBB5_4: ; %Flow1 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: global_store_b32 v[0:1], v1, off +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; GFX11-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -813,54 +1104,86 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %p define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #1 { ; GCN-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget: ; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN-NEXT: s_cbranch_execz .LBB6_4 +; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-NEXT: s_mov_b64 s[2:3], 0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_bcnt1_i32_b64 s7, s[4:5] +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s7 +; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: v_mul_f32_e32 v2, 4.0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dword s4, s[0:1], 0x0 +; GCN-NEXT: s_load_dword s6, s[0:1], 0x0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s4 -; GCN-NEXT: .LBB6_1: ; %atomicrmw.start +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: .LBB6_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GCN-NEXT: v_mov_b32_e32 v5, v1 +; GCN-NEXT: v_add_f32_e32 v4, v5, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GCN-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1_vol -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GCN-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GCN-NEXT: s_cbranch_execnz .LBB6_1 -; GCN-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB6_2 +; GCN-NEXT: ; %bb.3: ; %Flow +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB6_4: ; %Flow1 ; GCN-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN-NEXT: global_store_dword v[0:1], v1, off +; GCN-NEXT: v_readfirstlane_b32 s0, v1 +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GCN-NEXT: v_mad_f32 v0, v0, 4.0, s0 +; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget: ; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b64 s[4:5], exec +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX11-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b64 s[2:3], 0 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_bcnt1_i32_b64 s7, s[4:5] +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s7 +; GFX11-NEXT: s_mov_b64 s[4:5], 0 +; GFX11-NEXT: v_mul_f32_e32 v2, 4.0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v1 +; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX11-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_wbinvl1_vol -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX11-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX11-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX11-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX11-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX11-NEXT: s_cbranch_execnz .LBB6_2 +; GFX11-NEXT: ; %bb.3: ; %Flow +; GFX11-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11-NEXT: .LBB6_4: ; %Flow1 ; GFX11-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX11-NEXT: global_store_dword v[0:1], v1, off +; GFX11-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX11-NEXT: v_mad_f32 v0, v0, 4.0, s0 +; GFX11-NEXT: global_store_dword v[0:1], v0, off ; GFX11-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst store float %result, ptr addrspace(1) undef @@ -916,91 +1239,130 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addr define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %ptr) { ; GFX900-LABEL: global_atomic_fadd_noret_f32_safe: ; GFX900: ; %bb.0: +; GFX900-NEXT: s_mov_b64 s[2:3], exec +; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX900-NEXT: s_cbranch_execz .LBB8_3 +; GFX900-NEXT: ; %bb.1: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX900-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX900-NEXT: s_mov_b64 s[2:3], 0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v1, s4 -; GFX900-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX900-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX900-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX900-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_wbinvl1_vol ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX900-NEXT: v_mov_b32_e32 v1, v0 ; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX900-NEXT: s_cbranch_execnz .LBB8_1 -; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_cbranch_execnz .LBB8_2 +; GFX900-NEXT: .LBB8_3: ; GFX900-NEXT: s_endpgm ; ; GFX908-LABEL: global_atomic_fadd_noret_f32_safe: ; GFX908: ; %bb.0: +; GFX908-NEXT: s_mov_b64 s[2:3], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_cbranch_execz .LBB8_3 +; GFX908-NEXT: ; %bb.1: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX908-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX908-NEXT: s_mov_b64 s[2:3], 0 -; GFX908-NEXT: v_mov_b32_e32 v2, 0 +; GFX908-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX908-NEXT: v_mov_b32_e32 v3, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s4 -; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX908-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1_vol ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_cbranch_execnz .LBB8_2 +; GFX908-NEXT: .LBB8_3: ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: global_atomic_fadd_noret_f32_safe: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB8_3 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX90A-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_cbranch_execnz .LBB8_2 +; GFX90A-NEXT: .LBB8_3: ; GFX90A-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_fadd_noret_f32_safe: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s3, exec_lo +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB8_3 +; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX10-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX10-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -1008,25 +1370,33 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_cbranch_execnz .LBB8_2 +; GFX10-NEXT: .LBB8_3: ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_fadd_noret_f32_safe: ; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX11-NEXT: s_mov_b32 s4, exec_lo +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB8_3 +; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX11-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv @@ -1034,8 +1404,8 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_cbranch_execnz .LBB8_2 +; GFX11-NEXT: .LBB8_3: ; GFX11-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst ret void @@ -1044,26 +1414,35 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #0 { ; GFX900-LABEL: infer_as_before_atomic: ; GFX900: ; %bb.0: +; GFX900-NEXT: s_mov_b64 s[2:3], exec +; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX900-NEXT: s_cbranch_execz .LBB9_3 +; GFX900-NEXT: ; %bb.1: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX900-NEXT: s_bcnt1_i32_b64 s5, s[2:3] ; GFX900-NEXT: s_mov_b64 s[2:3], 0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v2, s5 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v1, s4 -; GFX900-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX900-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v1 -; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX900-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX900-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX900-NEXT: v_mov_b32_e32 v1, v0 ; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX900-NEXT: s_cbranch_execnz .LBB9_1 -; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_cbranch_execnz .LBB9_2 +; GFX900-NEXT: .LBB9_3: ; GFX900-NEXT: s_endpgm ; ; GFX908-LABEL: infer_as_before_atomic: @@ -1108,26 +1487,34 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #0 { ; ; GFX10-LABEL: infer_as_before_atomic: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s3, exec_lo +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB9_3 +; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_add_f32_e32 v0, 1.0, v1 -; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX10-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_cbranch_execnz .LBB9_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_cbranch_execnz .LBB9_2 +; GFX10-NEXT: .LBB9_3: ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: infer_as_before_atomic: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index e6b8992d62d89c..96c8688cdc19c5 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -17,94 +17,133 @@ declare float @div.float.value() define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-NEXT: .LBB0_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB0_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-NEXT: .LBB0_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB0_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-NEXT: .LBB0_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB0_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-NEXT: .LBB0_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: @@ -153,68 +192,97 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-DPP-NEXT: .LBB0_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-DPP-NEXT: .LBB0_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-DPP-NEXT: .LBB0_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: @@ -323,10 +391,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -341,23 +409,47 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dword v2, v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX9-NEXT: .LBB1_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s2, s1 +; GFX9-NEXT: s_ff1_i32_b32 s3, s0 +; GFX9-NEXT: s_add_i32 s2, s2, 32 +; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB1_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB1_4 +; GFX9-NEXT: .LBB1_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: @@ -367,42 +459,66 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 +; GFX1064-NEXT: s_add_i32 s2, s2, 32 +; GFX1064-NEXT: s_min_u32 s2, s3, s2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB1_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1064-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB1_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1064-NEXT: .LBB1_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: @@ -412,42 +528,62 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB1_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1032-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB1_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1032-NEXT: .LBB1_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: @@ -559,10 +695,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -577,23 +713,62 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX9-DPP-NEXT: .LBB1_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: @@ -603,42 +778,79 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1064-DPP-NEXT: .LBB1_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: @@ -648,42 +860,73 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1032-DPP-NEXT: .LBB1_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: @@ -827,256 +1070,474 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-NEXT: .LBB2_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 +; GFX9-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB2_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-NEXT: .LBB2_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB2_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-NEXT: .LBB2_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB2_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-NEXT: .LBB2_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB2_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1164-NEXT: .LBB2_3: +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB2_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1132-NEXT: .LBB2_3: +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-DPP-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1] ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v3 +; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-DPP-NEXT: .LBB2_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-DPP-NEXT: .LBB2_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-DPP-NEXT: .LBB2_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1164-DPP-NEXT: .LBB2_3: +; GFX1164-DPP-NEXT: s_nop 0 +; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1132-DPP-NEXT: .LBB2_3: +; GFX1132-DPP-NEXT: s_nop 0 +; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic ret void @@ -1142,10 +1603,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -1160,23 +1621,47 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dword v2, v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX9-NEXT: .LBB3_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s2, s1 +; GFX9-NEXT: s_ff1_i32_b32 s3, s0 +; GFX9-NEXT: s_add_i32 s2, s2, 32 +; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB3_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB3_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -1186,42 +1671,66 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 +; GFX1064-NEXT: s_add_i32 s2, s2, 32 +; GFX1064-NEXT: s_min_u32 s2, s3, s2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB3_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1064-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB3_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1064-NEXT: .LBB3_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -1231,115 +1740,187 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB3_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1032-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB3_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1032-NEXT: .LBB3_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 +; GFX1164-NEXT: s_add_i32 s2, s2, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s2, s3, s2 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB3_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB3_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1164-NEXT: .LBB3_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB3_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB3_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1132-NEXT: .LBB3_5: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -1350,10 +1931,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -1368,23 +1949,62 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX9-DPP-NEXT: .LBB3_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -1394,42 +2014,79 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1064-DPP-NEXT: .LBB3_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -1439,115 +2096,231 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1032-DPP-NEXT: .LBB3_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-DPP-NEXT: v_add_f32_e32 v4, v5, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1164-DPP-NEXT: .LBB3_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f32_e32 v4, v5, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1132-DPP-NEXT: .LBB3_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic @@ -1557,256 +2330,474 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{ ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-NEXT: .LBB4_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 +; GFX9-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB4_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-NEXT: .LBB4_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB4_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-NEXT: .LBB4_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB4_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-NEXT: .LBB4_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB4_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-NEXT: .LBB4_3: +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB4_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-NEXT: .LBB4_3: +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-DPP-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1] ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v3 +; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-DPP-NEXT: .LBB4_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-DPP-NEXT: .LBB4_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-DPP-NEXT: .LBB4_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-DPP-NEXT: .LBB4_3: +; GFX1164-DPP-NEXT: s_nop 0 +; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-DPP-NEXT: .LBB4_3: +; GFX1132-DPP-NEXT: s_nop 0 +; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic ret void @@ -1872,10 +2863,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -1890,23 +2881,47 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dword v2, v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX9-NEXT: .LBB5_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s2, s1 +; GFX9-NEXT: s_ff1_i32_b32 s3, s0 +; GFX9-NEXT: s_add_i32 s2, s2, 32 +; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB5_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB5_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB5_4 +; GFX9-NEXT: .LBB5_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: @@ -1916,42 +2931,66 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 +; GFX1064-NEXT: s_add_i32 s2, s2, 32 +; GFX1064-NEXT: s_min_u32 s2, s3, s2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB5_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1064-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB5_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1064-NEXT: .LBB5_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: @@ -1961,42 +3000,62 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB5_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1032-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB5_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1032-NEXT: .LBB5_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: @@ -2108,10 +3167,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -2126,23 +3185,62 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX9-DPP-NEXT: .LBB5_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: @@ -2152,42 +3250,79 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1064-DPP-NEXT: .LBB5_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: @@ -2197,42 +3332,73 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1032-DPP-NEXT: .LBB5_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: @@ -2433,10 +3599,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -2451,23 +3617,47 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dword v2, v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX9-NEXT: .LBB6_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s2, s1 +; GFX9-NEXT: s_ff1_i32_b32 s3, s0 +; GFX9-NEXT: s_add_i32 s2, s2, 32 +; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB6_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB6_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB6_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB6_4 +; GFX9-NEXT: .LBB6_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: @@ -2477,42 +3667,66 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 +; GFX1064-NEXT: s_add_i32 s2, s2, 32 +; GFX1064-NEXT: s_min_u32 s2, s3, s2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB6_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1064-NEXT: .LBB6_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB6_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB6_4 +; GFX1064-NEXT: .LBB6_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: @@ -2522,43 +3736,63 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB6_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1032-NEXT: .LBB6_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB6_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1032-NEXT: s_endpgm +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB6_4 +; GFX1032-NEXT: .LBB6_5: +; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164: ; %bb.0: @@ -2669,10 +3903,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -2687,23 +3921,62 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: @@ -2713,42 +3986,79 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: @@ -2758,42 +4068,73 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: @@ -2937,256 +4278,474 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: ; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB7_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7LESS-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_2 +; GFX7LESS-NEXT: .LBB7_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 +; GFX9-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB7_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB7_2 +; GFX9-NEXT: .LBB7_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: ; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB7_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1064-NEXT: .LBB7_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: ; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB7_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1032-NEXT: .LBB7_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB7_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1164-NEXT: .LBB7_3: +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: ; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB7_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1132-NEXT: .LBB7_3: +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: ; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-DPP-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1] ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v3 +; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX9-DPP-NEXT: .LBB7_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: ; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1064-DPP-NEXT: .LBB7_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: ; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1164-DPP-NEXT: .LBB7_3: +; GFX1164-DPP-NEXT: s_nop 0 +; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: ; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1132-DPP-NEXT: .LBB7_3: +; GFX1132-DPP-NEXT: s_nop 0 +; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 ret void @@ -3251,10 +4810,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -3269,23 +4828,47 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dword v2, v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX9-NEXT: .LBB8_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s2, s1 +; GFX9-NEXT: s_ff1_i32_b32 s3, s0 +; GFX9-NEXT: s_add_i32 s2, s2, 32 +; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB8_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB8_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB8_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB8_4 +; GFX9-NEXT: .LBB8_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp: @@ -3295,42 +4878,66 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 +; GFX1064-NEXT: s_add_i32 s2, s2, 32 +; GFX1064-NEXT: s_min_u32 s2, s3, s2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB8_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1064-NEXT: .LBB8_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB8_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1064-NEXT: .LBB8_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp: @@ -3340,115 +4947,187 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB8_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1032-NEXT: .LBB8_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB8_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1032-NEXT: .LBB8_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 +; GFX1164-NEXT: s_add_i32 s2, s2, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s2, s3, s2 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB8_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB8_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB8_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1164-NEXT: .LBB8_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB8_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB8_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB8_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1132-NEXT: .LBB8_5: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp: @@ -3459,10 +5138,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -3477,23 +5156,62 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX9-DPP-NEXT: .LBB8_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp: @@ -3503,42 +5221,79 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1064-DPP-NEXT: .LBB8_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp: @@ -3548,115 +5303,231 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1032-DPP-NEXT: .LBB8_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-DPP-NEXT: v_add_f32_e32 v4, v5, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1164-DPP-NEXT: .LBB8_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-DPP-NEXT: v_add_f32_e32 v4, v5, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1132-DPP-NEXT: .LBB8_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue monotonic, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 32b558e64d52bf..52bfc034575a0f 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -17,256 +17,368 @@ declare float @div.float.value() define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f32_e32 v0, -4.0, v1 +; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-NEXT: .LBB0_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB0_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-NEXT: .LBB0_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB0_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-NEXT: .LBB0_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB0_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-NEXT: .LBB0_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB0_3 +; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1164-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB0_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1164-NEXT: .LBB0_3: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB0_3 +; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB0_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1132-NEXT: .LBB0_3: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-DPP-NEXT: .LBB0_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-DPP-NEXT: .LBB0_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-DPP-NEXT: .LBB0_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1164-DPP-NEXT: .LBB0_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1132-DPP-NEXT: .LBB0_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4 ret void @@ -331,10 +443,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -349,23 +461,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dword v2, v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: .LBB1_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s2, s1 +; GFX9-NEXT: s_ff1_i32_b32 s3, s0 +; GFX9-NEXT: s_add_i32 s2, s2, 32 +; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_subrev_f32_e32 v2, s4, v2 +; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB1_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB1_4 +; GFX9-NEXT: .LBB1_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: @@ -375,42 +511,66 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 +; GFX1064-NEXT: s_add_i32 s2, s2, 32 +; GFX1064-NEXT: s_min_u32 s2, s3, s2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_subrev_f32_e32 v2, s4, v2 +; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB1_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1064-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB1_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1064-NEXT: .LBB1_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: @@ -420,115 +580,187 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_subrev_f32_e32 v2, s2, v2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB1_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1032-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB1_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1032-NEXT: .LBB1_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 +; GFX1164-NEXT: s_add_i32 s2, s2, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s2, s3, s2 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_subrev_f32_e32 v2, s4, v2 +; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB1_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB1_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1164-NEXT: .LBB1_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_subrev_f32_e32 v2, s2, v2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB1_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB1_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1132-NEXT: .LBB1_5: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: @@ -539,10 +771,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -557,23 +789,53 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, s4, v1 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX9-DPP-NEXT: .LBB1_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: @@ -583,42 +845,69 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_sub_f32_e64 v3, s2, s3 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v5 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1064-DPP-NEXT: .LBB1_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: @@ -628,115 +917,195 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v5 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1032-DPP-NEXT: .LBB1_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b32 v4, v0, s[0:1] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-DPP-NEXT: v_sub_f32_e32 v3, v4, v5 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1164-DPP-NEXT: .LBB1_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v1 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b32 v4, v0, s[0:1] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-DPP-NEXT: v_sub_f32_e32 v3, v4, v5 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1132-DPP-NEXT: .LBB1_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4 @@ -746,256 +1115,474 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f32_e32 v0, -4.0, v1 +; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-NEXT: .LBB2_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: v_sub_f32_e32 v1, v2, v3 +; GFX9-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB2_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-NEXT: .LBB2_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB2_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-NEXT: .LBB2_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB2_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-NEXT: .LBB2_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB2_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1164-NEXT: .LBB2_3: +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB2_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1132-NEXT: .LBB2_3: +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-DPP-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1] ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v3 +; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-DPP-NEXT: .LBB2_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-DPP-NEXT: .LBB2_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-DPP-NEXT: .LBB2_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1164-DPP-NEXT: .LBB2_3: +; GFX1164-DPP-NEXT: s_nop 0 +; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1132-DPP-NEXT: .LBB2_3: +; GFX1132-DPP-NEXT: s_nop 0 +; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic ret void @@ -1061,10 +1648,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -1079,23 +1666,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dword v2, v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: .LBB3_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s2, s1 +; GFX9-NEXT: s_ff1_i32_b32 s3, s0 +; GFX9-NEXT: s_add_i32 s2, s2, 32 +; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_subrev_f32_e32 v2, s4, v2 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB3_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB3_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -1105,42 +1716,66 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 +; GFX1064-NEXT: s_add_i32 s2, s2, 32 +; GFX1064-NEXT: s_min_u32 s2, s3, s2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_subrev_f32_e32 v2, s4, v2 +; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB3_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1064-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB3_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1064-NEXT: .LBB3_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -1150,115 +1785,187 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_subrev_f32_e32 v2, s2, v2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB3_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1032-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB3_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1032-NEXT: .LBB3_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 +; GFX1164-NEXT: s_add_i32 s2, s2, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s2, s3, s2 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_subrev_f32_e32 v2, s4, v2 +; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB3_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB3_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1164-NEXT: .LBB3_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_subrev_f32_e32 v2, s2, v2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB3_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB3_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1132-NEXT: .LBB3_5: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -1269,10 +1976,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -1287,23 +1994,53 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, s4, v1 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX9-DPP-NEXT: .LBB3_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -1313,42 +2050,69 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_sub_f32_e64 v3, s2, s3 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v5 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1064-DPP-NEXT: .LBB3_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: @@ -1358,115 +2122,195 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v5 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1032-DPP-NEXT: .LBB3_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b32 v4, v0, s[0:1] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-DPP-NEXT: v_sub_f32_e32 v3, v4, v5 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1164-DPP-NEXT: .LBB3_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v1 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b32 v4, v0, s[0:1] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-DPP-NEXT: v_sub_f32_e32 v3, v4, v5 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1132-DPP-NEXT: .LBB3_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic @@ -1476,256 +2320,474 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{ ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f32_e32 v0, -4.0, v1 +; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-NEXT: .LBB4_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: v_sub_f32_e32 v1, v2, v3 +; GFX9-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB4_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-NEXT: .LBB4_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB4_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-NEXT: .LBB4_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB4_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-NEXT: .LBB4_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB4_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-NEXT: .LBB4_3: +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB4_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-NEXT: .LBB4_3: +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-DPP-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1] ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v3 +; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-DPP-NEXT: .LBB4_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-DPP-NEXT: .LBB4_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-DPP-NEXT: .LBB4_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-DPP-NEXT: .LBB4_3: +; GFX1164-DPP-NEXT: s_nop 0 +; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-DPP-NEXT: .LBB4_3: +; GFX1132-DPP-NEXT: s_nop 0 +; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic ret void @@ -1791,10 +2853,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -1809,23 +2871,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dword v2, v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: .LBB5_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s2, s1 +; GFX9-NEXT: s_ff1_i32_b32 s3, s0 +; GFX9-NEXT: s_add_i32 s2, s2, 32 +; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_subrev_f32_e32 v2, s4, v2 +; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB5_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB5_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB5_4 +; GFX9-NEXT: .LBB5_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: @@ -1835,42 +2921,66 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 +; GFX1064-NEXT: s_add_i32 s2, s2, 32 +; GFX1064-NEXT: s_min_u32 s2, s3, s2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_subrev_f32_e32 v2, s4, v2 +; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB5_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1064-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB5_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1064-NEXT: .LBB5_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: @@ -1880,115 +2990,187 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_subrev_f32_e32 v2, s2, v2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB5_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1032-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB5_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1032-NEXT: .LBB5_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 +; GFX1164-NEXT: s_add_i32 s2, s2, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s2, s3, s2 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_subrev_f32_e32 v2, s4, v2 +; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB5_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB5_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1164-NEXT: .LBB5_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_subrev_f32_e32 v2, s2, v2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB5_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB5_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1132-NEXT: .LBB5_5: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: @@ -1999,10 +3181,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -2017,23 +3199,53 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, s4, v1 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX9-DPP-NEXT: .LBB5_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: @@ -2043,42 +3255,69 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_sub_f32_e64 v3, s2, s3 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v5 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1064-DPP-NEXT: .LBB5_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: @@ -2088,115 +3327,195 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v5 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1032-DPP-NEXT: .LBB5_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b32 v4, v0, s[0:1] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-DPP-NEXT: v_sub_f32_e32 v3, v4, v5 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1164-DPP-NEXT: .LBB5_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v1 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b32 v4, v0, s[0:1] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-DPP-NEXT: v_sub_f32_e32 v3, v4, v5 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1132-DPP-NEXT: .LBB5_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic @@ -2263,10 +3582,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -2281,23 +3600,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dword v2, v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: .LBB6_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s2, s1 +; GFX9-NEXT: s_ff1_i32_b32 s3, s0 +; GFX9-NEXT: s_add_i32 s2, s2, 32 +; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_subrev_f32_e32 v2, s4, v2 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB6_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB6_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB6_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB6_4 +; GFX9-NEXT: .LBB6_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: @@ -2307,42 +3650,66 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 +; GFX1064-NEXT: s_add_i32 s2, s2, 32 +; GFX1064-NEXT: s_min_u32 s2, s3, s2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_subrev_f32_e32 v2, s4, v2 +; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB6_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1064-NEXT: .LBB6_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB6_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB6_4 +; GFX1064-NEXT: .LBB6_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: @@ -2352,115 +3719,187 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_subrev_f32_e32 v2, s2, v2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB6_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1032-NEXT: .LBB6_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB6_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB6_4 +; GFX1032-NEXT: .LBB6_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 +; GFX1164-NEXT: s_add_i32 s2, s2, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s2, s3, s2 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_subrev_f32_e32 v2, s4, v2 +; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB6_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB6_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB6_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB6_4 +; GFX1164-NEXT: .LBB6_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB6_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_subrev_f32_e32 v2, s2, v2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB6_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB6_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB6_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB6_4 +; GFX1132-NEXT: .LBB6_5: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: @@ -2471,10 +3910,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -2489,23 +3928,53 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, s4, v1 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: @@ -2515,42 +3984,69 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_sub_f32_e64 v3, s2, s3 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v5 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: @@ -2560,115 +4056,195 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v5 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b32 v4, v0, s[0:1] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-DPP-NEXT: v_sub_f32_e32 v3, v4, v5 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1164-DPP-NEXT: .LBB6_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v1 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b32 v4, v0, s[0:1] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-DPP-NEXT: v_sub_f32_e32 v3, v4, v5 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1132-DPP-NEXT: .LBB6_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic @@ -2678,256 +4254,474 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 { ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: ; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB7_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7LESS-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_add_f32_e32 v0, -4.0, v1 +; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 -; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1 -; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_2 +; GFX7LESS-NEXT: .LBB7_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: v_sub_f32_e32 v1, v2, v3 +; GFX9-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB7_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_cbranch_execnz .LBB7_2 +; GFX9-NEXT: .LBB7_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: ; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB7_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1064-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1064-NEXT: .LBB7_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: ; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB7_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1032-NEXT: .LBB7_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB7_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1164-NEXT: .LBB7_3: +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: ; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB7_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1132-NEXT: .LBB7_3: +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: ; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-DPP-NEXT: v_add_f64 v[1:2], s[2:3], v[0:1] ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mul_f32_e32 v3, 4.0, v1 +; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v3 +; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX9-DPP-NEXT: .LBB7_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: ; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1064-DPP-NEXT: .LBB7_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: ; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], s[4:5], s[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1164-DPP-NEXT: .LBB7_3: +; GFX1164-DPP-NEXT: s_nop 0 +; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: ; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:12 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:8 +; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_mov_b32 s3, 0xc3300000 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1132-DPP-NEXT: .LBB7_3: +; GFX1132-DPP-NEXT: s_nop 0 +; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 ret void @@ -2992,10 +4786,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s9 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 44 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -3010,23 +4804,47 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop ; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: global_load_dword v2, v40, s[34:35] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: .LBB8_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s2, s1 +; GFX9-NEXT: s_ff1_i32_b32 s3, s0 +; GFX9-NEXT: s_add_i32 s2, s2, 32 +; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 +; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_subrev_f32_e32 v2, s4, v2 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz .LBB8_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB8_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB8_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB8_4 +; GFX9-NEXT: .LBB8_5: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp: @@ -3036,42 +4854,66 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-NEXT: s_mov_b32 s14, s8 -; GFX1064-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-NEXT: s_getpc_b64 s[4:5] -; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[2:3] +; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 -; GFX1064-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 +; GFX1064-NEXT: s_add_i32 s2, s2, 32 +; GFX1064-NEXT: s_min_u32 s2, s3, s2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_subrev_f32_e32 v2, s4, v2 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execz .LBB8_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dword v1, v3, s[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1064-NEXT: .LBB8_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB8_1 -; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1064-NEXT: .LBB8_5: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp: @@ -3081,115 +4923,187 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-NEXT: s_mov_b32 s14, s8 -; GFX1032-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-NEXT: s_getpc_b64 s[4:5] -; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[2:3] +; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b32 s12, s6 -; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s13, s7 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 -; GFX1032-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032-NEXT: v_subrev_f32_e32 v2, s2, v2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execz .LBB8_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dword v1, v3, s[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1032-NEXT: .LBB8_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB8_1 -; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1032-NEXT: .LBB8_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-NEXT: s_mov_b32 s14, s8 -; GFX1164-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-NEXT: s_getpc_b64 s[4:5] -; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[2:3] +; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-NEXT: s_mov_b32 s12, s6 ; GFX1164-NEXT: s_mov_b32 s13, s7 ; GFX1164-NEXT: s_mov_b32 s32, 0 -; GFX1164-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 +; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 +; GFX1164-NEXT: s_add_i32 s2, s2, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_min_u32 s2, s3, s2 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 +; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_subrev_f32_e32 v2, s4, v2 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execz .LBB8_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1164-NEXT: .LBB8_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1164-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB8_1 -; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1164-NEXT: .LBB8_5: ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-NEXT: s_getpc_b64 s[4:5] -; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[2:3] +; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: .LBB8_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_subrev_f32_e32 v2, s2, v2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execz .LBB8_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1132-NEXT: .LBB8_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1132-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB8_1 -; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1132-NEXT: .LBB8_5: ; GFX1132-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp: @@ -3200,10 +5114,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX9-DPP-NEXT: s_mov_b32 s14, s8 -; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 @@ -3218,23 +5132,53 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop ; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, s4, v1 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_1 -; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX9-DPP-NEXT: .LBB8_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp: @@ -3244,42 +5188,69 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1064-DPP-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-DPP-NEXT: v_sub_f32_e64 v3, s2, s3 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v5 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_1 -; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1064-DPP-NEXT: .LBB8_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp: @@ -3289,115 +5260,195 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 +; GFX1032-DPP-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v5 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_1 -; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1032-DPP-NEXT: .LBB8_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 -; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 -; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1164-DPP-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_load_b32 v4, v0, s[0:1] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-DPP-NEXT: v_sub_f32_e32 v3, v4, v5 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_1 -; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1164-DPP-NEXT: .LBB8_3: ; GFX1164-DPP-NEXT: s_endpgm ; ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 -; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX1132-DPP-NEXT: global_load_b32 v2, v40, s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: v_subrev_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v1 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_load_b32 v4, v0, s[0:1] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 -; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-DPP-NEXT: v_sub_f32_e32 v3, v4, v5 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_1 -; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1132-DPP-NEXT: .LBB8_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue monotonic, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index bb422adeaafbf4..41d25328ded8ba 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -197,6 +197,10 @@ ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: Infer address spaces +; GCN-O1-NEXT: Dominator Tree Construction +; GCN-O1-NEXT: Cycle Info Analysis +; GCN-O1-NEXT: Uniformity Analysis +; GCN-O1-NEXT: AMDGPU atomic optimizations ; GCN-O1-NEXT: Expand Atomic instructions ; GCN-O1-NEXT: AMDGPU Promote Alloca ; GCN-O1-NEXT: Dominator Tree Construction @@ -257,14 +261,11 @@ ; GCN-O1-NEXT: Cycle Info Analysis ; GCN-O1-NEXT: Uniformity Analysis ; GCN-O1-NEXT: AMDGPU IR late optimizations -; GCN-O1-NEXT: AMDGPU atomic optimizations ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Natural Loop Information ; GCN-O1-NEXT: Code sinking ; GCN-O1-NEXT: Post-Dominator Tree Construction -; GCN-O1-NEXT: Cycle Info Analysis -; GCN-O1-NEXT: Uniformity Analysis ; GCN-O1-NEXT: Unify divergent function exit nodes ; GCN-O1-NEXT: Lazy Value Information Analysis ; GCN-O1-NEXT: Lower SwitchInst's to branches @@ -475,6 +476,10 @@ ; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-OPTS-NEXT: FunctionPass Manager ; GCN-O1-OPTS-NEXT: Infer address spaces +; GCN-O1-OPTS-NEXT: Dominator Tree Construction +; GCN-O1-OPTS-NEXT: Cycle Info Analysis +; GCN-O1-OPTS-NEXT: Uniformity Analysis +; GCN-O1-OPTS-NEXT: AMDGPU atomic optimizations ; GCN-O1-OPTS-NEXT: Expand Atomic instructions ; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca ; GCN-O1-OPTS-NEXT: Dominator Tree Construction @@ -559,14 +564,11 @@ ; GCN-O1-OPTS-NEXT: Cycle Info Analysis ; GCN-O1-OPTS-NEXT: Uniformity Analysis ; GCN-O1-OPTS-NEXT: AMDGPU IR late optimizations -; GCN-O1-OPTS-NEXT: AMDGPU atomic optimizations ; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results ; GCN-O1-OPTS-NEXT: Natural Loop Information ; GCN-O1-OPTS-NEXT: Code sinking ; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction -; GCN-O1-OPTS-NEXT: Cycle Info Analysis -; GCN-O1-OPTS-NEXT: Uniformity Analysis ; GCN-O1-OPTS-NEXT: Unify divergent function exit nodes ; GCN-O1-OPTS-NEXT: Lazy Value Information Analysis ; GCN-O1-OPTS-NEXT: Lower SwitchInst's to branches @@ -785,6 +787,10 @@ ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: Infer address spaces +; GCN-O2-NEXT: Dominator Tree Construction +; GCN-O2-NEXT: Cycle Info Analysis +; GCN-O2-NEXT: Uniformity Analysis +; GCN-O2-NEXT: AMDGPU atomic optimizations ; GCN-O2-NEXT: Expand Atomic instructions ; GCN-O2-NEXT: AMDGPU Promote Alloca ; GCN-O2-NEXT: Dominator Tree Construction @@ -869,14 +875,11 @@ ; GCN-O2-NEXT: Cycle Info Analysis ; GCN-O2-NEXT: Uniformity Analysis ; GCN-O2-NEXT: AMDGPU IR late optimizations -; GCN-O2-NEXT: AMDGPU atomic optimizations ; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Natural Loop Information ; GCN-O2-NEXT: Code sinking ; GCN-O2-NEXT: Post-Dominator Tree Construction -; GCN-O2-NEXT: Cycle Info Analysis -; GCN-O2-NEXT: Uniformity Analysis ; GCN-O2-NEXT: Unify divergent function exit nodes ; GCN-O2-NEXT: Lazy Value Information Analysis ; GCN-O2-NEXT: Lower SwitchInst's to branches @@ -1097,6 +1100,10 @@ ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: Infer address spaces +; GCN-O3-NEXT: Dominator Tree Construction +; GCN-O3-NEXT: Cycle Info Analysis +; GCN-O3-NEXT: Uniformity Analysis +; GCN-O3-NEXT: AMDGPU atomic optimizations ; GCN-O3-NEXT: Expand Atomic instructions ; GCN-O3-NEXT: AMDGPU Promote Alloca ; GCN-O3-NEXT: Dominator Tree Construction @@ -1191,14 +1198,11 @@ ; GCN-O3-NEXT: Cycle Info Analysis ; GCN-O3-NEXT: Uniformity Analysis ; GCN-O3-NEXT: AMDGPU IR late optimizations -; GCN-O3-NEXT: AMDGPU atomic optimizations ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Natural Loop Information ; GCN-O3-NEXT: Code sinking ; GCN-O3-NEXT: Post-Dominator Tree Construction -; GCN-O3-NEXT: Cycle Info Analysis -; GCN-O3-NEXT: Uniformity Analysis ; GCN-O3-NEXT: Unify divergent function exit nodes ; GCN-O3-NEXT: Lazy Value Information Analysis ; GCN-O3-NEXT: Lower SwitchInst's to branches diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll index c8168e5e6c6cab..f7addf72895ca0 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll @@ -146,50 +146,40 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b64 s[4:5], exec ; VI-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; VI-NEXT: s_bcnt1_i32_b64 s6, s[4:5] ; VI-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s3, s3, 4 -; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: ; implicit-def: $vgpr2 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_and_saveexec_b64 s[6:7], vcc ; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: -; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; VI-NEXT: s_lshl_b32 s8, s3, 3 -; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 -; VI-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: ds_add_rtn_f32 v1, v2, v1 +; VI-NEXT: ds_add_rtn_f32 v2, v2, v1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: .LBB2_2: ; VI-NEXT: s_or_b64 exec, exec, s[6:7] -; VI-NEXT: s_mov_b64 s[6:7], exec ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_readfirstlane_b32 s8, v1 -; VI-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 -; VI-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: v_readfirstlane_b32 s8, v2 +; VI-NEXT: s_and_saveexec_b64 s[6:7], vcc ; VI-NEXT: s_cbranch_execz .LBB2_4 ; VI-NEXT: ; %bb.3: -; VI-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; VI-NEXT: s_lshl_b32 s3, s3, 4 -; VI-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_add_f32 v2, v1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: .LBB2_4: -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; VI-NEXT: v_add_f32_e32 v2, s8, v0 -; VI-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; VI-NEXT: s_mov_b64 s[4:5], exec -; VI-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 ; VI-NEXT: v_bfrev_b32_e32 v1, 1 ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: .LBB2_5: ; %ComputeLoop @@ -208,7 +198,6 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; VI-NEXT: v_add_f32_e32 v1, s9, v1 ; VI-NEXT: s_cbranch_scc1 .LBB2_5 ; VI-NEXT: ; %bb.6: ; %ComputeEnd -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; VI-NEXT: ; implicit-def: $vgpr2 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -235,49 +224,39 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[4:5] ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_i32 s3, s3, 4 -; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_lshl_b32 s8, s3, 3 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_f32 v1, v2, v1 +; GFX9-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB2_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; GFX9-NEXT: s_lshl_b32 s3, s3, 4 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_f32 v2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB2_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX9-NEXT: v_add_f32_e32 v2, s8, v0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: .LBB2_5: ; %ComputeLoop @@ -296,7 +275,6 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; GFX9-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_5 ; GFX9-NEXT: ; %bb.6: ; %ComputeEnd -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -318,68 +296,85 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; ; GFX7-LABEL: lds_ds_fadd: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b32 s4, s3, 3 -; GFX7-NEXT: s_add_i32 s4, s4, 32 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: ds_read_b32 v0, v0 -; GFX7-NEXT: s_add_i32 s3, s3, 4 -; GFX7-NEXT: s_lshl_b32 s6, s3, 3 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX7-NEXT: s_add_i32 s5, s5, 4 +; GFX7-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: ; implicit-def: $vgpr2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX7-NEXT: s_cbranch_execz .LBB2_4 +; GFX7-NEXT: ; %bb.1: +; GFX7-NEXT: s_lshl_b32 s10, s5, 3 +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: ds_read_b32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_add_f32_e32 v2, 0x42280000, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: v_add_f32_e32 v4, v3, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v0, v1, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB2_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_lshl_b32 s3, s3, 4 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: ds_read_b32 v1, v1 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB2_3: ; %atomicrmw.start2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], v2, v3 +; GFX7-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7-NEXT: ; %bb.3: ; %Flow15 +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: .LBB2_4: ; %Flow16 +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_readfirstlane_b32 s8, v2 +; GFX7-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7-NEXT: s_cbranch_execz .LBB2_7 +; GFX7-NEXT: ; %bb.5: +; GFX7-NEXT: s_lshl_b32 s5, s5, 4 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: ds_read_b32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB2_6: ; %atomicrmw.start2 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v2, 0x42280000, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: v_add_f32_e32 v3, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v1, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB2_3 -; GFX7-NEXT: ; %bb.4: ; %atomicrmw.end1 -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, v3 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB2_6 +; GFX7-NEXT: .LBB2_7: ; %Flow14 +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_read_b32 v1, v1 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB2_5: ; %atomicrmw.start8 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; GFX7-NEXT: v_add_f32_e32 v0, s8, v0 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 +; GFX7-NEXT: .LBB2_8: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: v_add_f32_e32 v3, v2, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v1, v2, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB2_5 -; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end7 -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB2_8 +; GFX7-NEXT: ; %bb.9: ; %atomicrmw.end7 +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -389,67 +384,85 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; ; GFX8-LABEL: lds_ds_fadd: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; GFX8-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX8-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b32 s4, s3, 3 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: ds_read_b32 v0, v0 offset:32 -; GFX8-NEXT: s_add_i32 s3, s3, 4 -; GFX8-NEXT: s_lshl_b32 s6, s3, 3 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX8-NEXT: s_add_i32 s5, s5, 4 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: ; implicit-def: $vgpr2 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_lshl_b32 s10, s5, 3 +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: ds_read_b32 v2, v2 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_add_f32_e32 v2, 0x42280000, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_add_f32_e32 v4, v3, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_cmpst_rtn_b32 v0, v0, v1, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB2_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_lshl_b32 s3, s3, 4 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: ds_read_b32 v1, v1 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB2_3: ; %atomicrmw.start2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], v2, v3 +; GFX8-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB2_2 +; GFX8-NEXT: ; %bb.3: ; %Flow17 +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: .LBB2_4: ; %Flow18 +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_readfirstlane_b32 s8, v2 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_cbranch_execz .LBB2_7 +; GFX8-NEXT: ; %bb.5: +; GFX8-NEXT: s_lshl_b32 s5, s5, 4 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: ds_read_b32 v2, v2 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB2_6: ; %atomicrmw.start2 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v2, 0x42280000, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_f32_e32 v3, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v3, v1, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB2_3 -; GFX8-NEXT: ; %bb.4: ; %atomicrmw.end1 -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB2_6 +; GFX8-NEXT: .LBB2_7: ; %Flow16 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: ds_read_b32 v1, v1 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB2_5: ; %atomicrmw.start8 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; GFX8-NEXT: v_add_f32_e32 v0, s8, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: .LBB2_8: ; %atomicrmw.start8 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_add_f32_e32 v3, v2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v1, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB2_5 -; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end7 -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB2_8 +; GFX8-NEXT: ; %bb.9: ; %atomicrmw.end7 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -474,46 +487,36 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b64 s[4:5], exec ; VI-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; VI-NEXT: s_bcnt1_i32_b64 s6, s[4:5] ; VI-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s3, s3, 4 -; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: ; implicit-def: $vgpr2 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_and_saveexec_b64 s[6:7], vcc ; VI-NEXT: s_cbranch_execz .LBB3_2 ; VI-NEXT: ; %bb.1: -; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; VI-NEXT: s_lshl_b32 s8, s3, 3 -; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 -; VI-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: ds_add_rtn_f32 v1, v2, v1 +; VI-NEXT: ds_add_rtn_f32 v2, v2, v1 ; VI-NEXT: .LBB3_2: ; VI-NEXT: s_or_b64 exec, exec, s[6:7] -; VI-NEXT: s_mov_b64 s[6:7], exec ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_readfirstlane_b32 s8, v1 -; VI-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 -; VI-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: v_readfirstlane_b32 s8, v2 +; VI-NEXT: s_and_saveexec_b64 s[6:7], vcc ; VI-NEXT: s_cbranch_execz .LBB3_4 ; VI-NEXT: ; %bb.3: -; VI-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; VI-NEXT: s_lshl_b32 s3, s3, 4 -; VI-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: ds_add_f32 v2, v1 ; VI-NEXT: .LBB3_4: -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; VI-NEXT: v_add_f32_e32 v2, s8, v0 -; VI-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; VI-NEXT: s_mov_b64 s[4:5], exec -; VI-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 ; VI-NEXT: v_bfrev_b32_e32 v1, 1 ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: .LBB3_5: ; %ComputeLoop @@ -532,7 +535,6 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; VI-NEXT: v_add_f32_e32 v1, s9, v1 ; VI-NEXT: s_cbranch_scc1 .LBB3_5 ; VI-NEXT: ; %bb.6: ; %ComputeEnd -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; VI-NEXT: ; implicit-def: $vgpr2 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -557,45 +559,35 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[4:5] ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_i32 s3, s3, 4 -; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX9-NEXT: s_cbranch_execz .LBB3_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_lshl_b32 s8, s3, 3 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: ds_add_rtn_f32 v1, v2, v1 +; GFX9-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX9-NEXT: .LBB3_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX9-NEXT: s_cbranch_execz .LBB3_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; GFX9-NEXT: s_lshl_b32 s3, s3, 4 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: ds_add_f32 v2, v1 ; GFX9-NEXT: .LBB3_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX9-NEXT: v_add_f32_e32 v2, s8, v0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: .LBB3_5: ; %ComputeLoop @@ -614,7 +606,6 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; GFX9-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_5 ; GFX9-NEXT: ; %bb.6: ; %ComputeEnd -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -634,65 +625,82 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; ; GFX7-LABEL: lds_ds_fadd_one_as: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b32 s4, s3, 3 -; GFX7-NEXT: s_add_i32 s4, s4, 32 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: ds_read_b32 v0, v0 -; GFX7-NEXT: s_add_i32 s3, s3, 4 -; GFX7-NEXT: s_lshl_b32 s6, s3, 3 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7-NEXT: s_add_i32 s5, s5, 4 +; GFX7-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: ; implicit-def: $vgpr2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX7-NEXT: s_cbranch_execz .LBB3_4 +; GFX7-NEXT: ; %bb.1: +; GFX7-NEXT: s_lshl_b32 s10, s5, 3 +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: ds_read_b32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_add_f32_e32 v2, 0x42280000, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v0, v1, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: v_add_f32_e32 v4, v3, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB3_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_lshl_b32 s3, s3, 4 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: ds_read_b32 v1, v1 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB3_3: ; %atomicrmw.start2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], v2, v3 +; GFX7-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB3_2 +; GFX7-NEXT: ; %bb.3: ; %Flow15 +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: .LBB3_4: ; %Flow16 +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_readfirstlane_b32 s8, v2 +; GFX7-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7-NEXT: s_cbranch_execz .LBB3_7 +; GFX7-NEXT: ; %bb.5: +; GFX7-NEXT: s_lshl_b32 s5, s5, 4 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: ds_read_b32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB3_6: ; %atomicrmw.start2 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v2, 0x42280000, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v1, v2 +; GFX7-NEXT: v_add_f32_e32 v3, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB3_3 -; GFX7-NEXT: ; %bb.4: ; %atomicrmw.end1 -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, v3 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB3_6 +; GFX7-NEXT: .LBB3_7: ; %Flow14 +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_read_b32 v1, v1 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB3_5: ; %atomicrmw.start8 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; GFX7-NEXT: v_add_f32_e32 v0, s8, v0 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 +; GFX7-NEXT: .LBB3_8: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v1 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: v_add_f32_e32 v3, v2, v0 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v1, v2, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB3_5 -; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end7 -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB3_8 +; GFX7-NEXT: ; %bb.9: ; %atomicrmw.end7 +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -702,64 +710,82 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: lds_ds_fadd_one_as: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; GFX8-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX8-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b32 s4, s3, 3 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: ds_read_b32 v0, v0 offset:32 -; GFX8-NEXT: s_add_i32 s3, s3, 4 -; GFX8-NEXT: s_lshl_b32 s6, s3, 3 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX8-NEXT: s_add_i32 s5, s5, 4 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: ; implicit-def: $vgpr2 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX8-NEXT: s_cbranch_execz .LBB3_4 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_lshl_b32 s10, s5, 3 +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: ds_read_b32 v2, v2 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_add_f32_e32 v2, 0x42280000, v1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v0, v0, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_add_f32_e32 v4, v3, v1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB3_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_lshl_b32 s3, s3, 4 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: ds_read_b32 v1, v1 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB3_3: ; %atomicrmw.start2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], v2, v3 +; GFX8-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB3_2 +; GFX8-NEXT: ; %bb.3: ; %Flow17 +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: .LBB3_4: ; %Flow18 +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_readfirstlane_b32 s8, v2 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_cbranch_execz .LBB3_7 +; GFX8-NEXT: ; %bb.5: +; GFX8-NEXT: s_lshl_b32 s5, s5, 4 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: ds_read_b32 v2, v2 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB3_6: ; %atomicrmw.start2 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v2, 0x42280000, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v3, v1, v2 +; GFX8-NEXT: v_add_f32_e32 v3, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB3_3 -; GFX8-NEXT: ; %bb.4: ; %atomicrmw.end1 -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB3_6 +; GFX8-NEXT: .LBB3_7: ; %Flow16 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: ds_read_b32 v1, v1 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB3_5: ; %atomicrmw.start8 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; GFX8-NEXT: v_add_f32_e32 v0, s8, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: .LBB3_8: ; %atomicrmw.start8 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_add_f32_e32 v3, v2, v0 ; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v1, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB3_5 -; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end7 -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB3_8 +; GFX8-NEXT: ; %bb.9: ; %atomicrmw.end7 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1