Skip to content

Commit

Permalink
[AMDGPU] Reorder atomic optimizer to avoid CAS loop.
Browse files Browse the repository at this point in the history
The AtomicExpand pass emits a CAS loop for FP operations,
which limits the optimizations the atomic optimizer can perform.

Moving the atomic optimizer before AtomicExpand allows
better codegen.

Reviewed By: arsenm, #amdgpu

Differential Revision: https://reviews.llvm.org/D157265

Change-Id: I68744786339644060bca4199c041f2020d9b9425
  • Loading branch information
pravinjagtap committed Sep 29, 2023
1 parent 0b01944 commit 30a3adf
Show file tree
Hide file tree
Showing 11 changed files with 7,383 additions and 3,057 deletions.
12 changes: 7 additions & 5 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1027,6 +1027,13 @@ void AMDGPUPassConfig::addIRPasses() {
if (TM.getOptLevel() > CodeGenOpt::None)
addPass(createInferAddressSpacesPass());

// Run atomic optimizer before Atomic Expand
if ((TM.getTargetTriple().getArch() == Triple::amdgcn) &&
(TM.getOptLevel() >= CodeGenOpt::Less) &&
(AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
}

addPass(createAtomicExpandPass());

if (TM.getOptLevel() > CodeGenOpt::None) {
Expand Down Expand Up @@ -1153,11 +1160,6 @@ bool GCNPassConfig::addPreISel() {
if (TM->getOptLevel() > CodeGenOpt::None)
addPass(createAMDGPULateCodeGenPreparePass());

if ((TM->getOptLevel() >= CodeGenOpt::Less) &&
(AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
}

if (TM->getOptLevel() > CodeGenOpt::None)
addPass(createSinkingPass());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,19 +141,19 @@ define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1)
define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, float %data) #0 {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
; GFX90A_GFX940-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000)
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[PRED_COPY:%[0-9]+]]:sreg_32 = PRED_COPY $sgpr0
; GFX90A_GFX940-NEXT: [[PRED_COPY1:%[0-9]+]]:sreg_32 = PRED_COPY $sgpr1
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[PRED_COPY]], %subreg.sub0, [[PRED_COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[PRED_COPY2:%[0-9]+]]:vgpr_32 = PRED_COPY $vgpr0
; GFX90A_GFX940-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
; GFX90A_GFX940-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A_GFX940-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_BRANCH %bb.2
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: bb.2 (%ir-block.5):
; GFX90A_GFX940-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
; GFX90A_GFX940-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[PRED_COPY3:%[0-9]+]]:sreg_64 = PRED_COPY $exec
; GFX90A_GFX940-NEXT: [[PRED_COPY4:%[0-9]+]]:sreg_32 = PRED_COPY [[PRED_COPY3]].sub0
Expand Down Expand Up @@ -196,29 +196,22 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX940-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[PRED_COPY18]], implicit $exec
; GFX90A_GFX940-NEXT: [[PRED_COPY19:%[0-9]+]]:vgpr_32 = PRED_COPY [[S_MOV_B32_1]]
; GFX90A_GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[PRED_COPY19]], implicit $exec
; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_BRANCH %bb.3
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: bb.3 (%ir-block.36):
; GFX90A_GFX940-NEXT: successors: %bb.5(0x80000000)
; GFX90A_GFX940-NEXT: bb.3 (%ir-block.35):
; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000)
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: S_BRANCH %bb.5
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: bb.4.Flow:
; GFX90A_GFX940-NEXT: successors: %bb.6(0x80000000)
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_BRANCH %bb.6
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: bb.5 (%ir-block.38):
; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000)
; GFX90A_GFX940-NEXT: successors: %bb.5(0x80000000)
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_BRANCH %bb.4
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: bb.6 (%ir-block.39):
; GFX90A_GFX940-NEXT: bb.5 (%ir-block.37):
; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_ENDPGM 0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
ret void
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.3
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: bb.3 (%ir-block.39):
; GFX11-NEXT: bb.3 (%ir-block.36):
; GFX11-NEXT: successors: %bb.5(0x80000000)
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
Expand All @@ -220,7 +220,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.6
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: bb.5 (%ir-block.42):
; GFX11-NEXT: bb.5 (%ir-block.39):
; GFX11-NEXT: successors: %bb.4(0x80000000)
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.3, [[DEF]], %bb.2
Expand All @@ -231,7 +231,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[PRED_COPY15]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.4
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: bb.6 (%ir-block.50):
; GFX11-NEXT: bb.6 (%ir-block.47):
; GFX11-NEXT: $vgpr0 = PRED_COPY [[PHI]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
Expand Down
7 changes: 3 additions & 4 deletions llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,15 @@ define amdgpu_ps void @main(i32 %arg) {
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: s_branch .LBB0_2
; GFX10-NEXT: .LBB0_1: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB0_2 Depth=1
; GFX10-NEXT: .LBB0_1: ; in Loop: Header=BB0_2 Depth=1
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX10-NEXT: s_and_b32 s0, exec_lo, vcc_lo
; GFX10-NEXT: s_or_b32 s2, s0, s2
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_execz .LBB0_5
; GFX10-NEXT: .LBB0_2: ; %bb4
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_and_b32 s0, exec_lo, vcc_lo
; GFX10-NEXT: s_or_b32 s2, s0, s2
; GFX10-NEXT: s_and_saveexec_b32 s3, s1
; GFX10-NEXT: s_cbranch_execz .LBB0_1
; GFX10-NEXT: ; %bb.3: ; in Loop: Header=BB0_2 Depth=1
Expand Down
25 changes: 9 additions & 16 deletions llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1)
define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, float %data) #0 {
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
; GFX90A_GFX940-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000)
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[PRED_COPY:%[0-9]+]]:vgpr_32 = PRED_COPY $vgpr0
Expand All @@ -156,11 +156,11 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[PRED_COPY2]], %subreg.sub0, [[PRED_COPY1]], %subreg.sub1
; GFX90A_GFX940-NEXT: [[PRED_COPY3:%[0-9]+]]:sreg_64 = PRED_COPY [[REG_SEQUENCE]]
; GFX90A_GFX940-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
; GFX90A_GFX940-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX90A_GFX940-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_BRANCH %bb.1
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: bb.1 (%ir-block.5):
; GFX90A_GFX940-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
; GFX90A_GFX940-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[PRED_COPY4:%[0-9]+]]:sreg_64 = PRED_COPY $exec
; GFX90A_GFX940-NEXT: [[PRED_COPY5:%[0-9]+]]:sreg_32 = PRED_COPY [[PRED_COPY4]].sub1
Expand Down Expand Up @@ -188,30 +188,23 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_2]]
; GFX90A_GFX940-NEXT: early-clobber %1:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec
; GFX90A_GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_BRANCH %bb.2
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: bb.2 (%ir-block.36):
; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000)
; GFX90A_GFX940-NEXT: bb.2 (%ir-block.35):
; GFX90A_GFX940-NEXT: successors: %bb.3(0x80000000)
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX90A_GFX940-NEXT: [[PRED_COPY8:%[0-9]+]]:vgpr_32 = PRED_COPY %1
; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_1]], [[PRED_COPY8]], [[PRED_COPY3]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX940-NEXT: S_BRANCH %bb.4
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: bb.3.Flow:
; GFX90A_GFX940-NEXT: successors: %bb.5(0x80000000)
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_BRANCH %bb.5
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: bb.4 (%ir-block.38):
; GFX90A_GFX940-NEXT: successors: %bb.3(0x80000000)
; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000)
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_BRANCH %bb.3
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: bb.5 (%ir-block.39):
; GFX90A_GFX940-NEXT: bb.4 (%ir-block.37):
; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_ENDPGM 0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
ret void
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.2
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: bb.2 (%ir-block.39):
; GFX11-NEXT: bb.2 (%ir-block.36):
; GFX11-NEXT: successors: %bb.4(0x80000000)
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
Expand All @@ -218,7 +218,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.5
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: bb.4 (%ir-block.42):
; GFX11-NEXT: bb.4 (%ir-block.39):
; GFX11-NEXT: successors: %bb.3(0x80000000)
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
Expand All @@ -228,7 +228,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_READFIRSTLANE_B32_]], 0, killed %44, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.3
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: bb.5 (%ir-block.50):
; GFX11-NEXT: bb.5 (%ir-block.47):
; GFX11-NEXT: $vgpr0 = PRED_COPY [[PHI]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
Expand Down
Loading

0 comments on commit 30a3adf

Please sign in to comment.