-
Notifications
You must be signed in to change notification settings - Fork 12.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
release/19.x: AMDGPU: Fix inst-selection of large scratch offsets with sgpr base (#110256) #110470
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel Author: None (llvmbot). Changes: Requested by @arsenm. Patch is 29.90 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/110470.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b7471bab128509..7b786ee2641721 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1911,7 +1911,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
0);
}
- Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16);
+ Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i32);
return true;
}
@@ -1967,7 +1967,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
return false;
if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
return false;
- Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
+ Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
return true;
}
}
@@ -2000,7 +2000,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
return false;
SAddr = SelectSAddrFI(CurDAG, SAddr);
- Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
+ Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
return true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index a5e4151bf36958..47ca6f416b02b0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -1513,4 +1513,243 @@ bb:
ret void
}
+define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) {
+; GFX9-LABEL: sgpr_base_large_offset:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-NEXT: s_add_u32 s0, s2, 0xffe8
+; GFX9-NEXT: scratch_load_dword v2, off, s0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: sgpr_base_large_offset:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_add_u32 s0, s0, s5
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-NEXT: s_add_u32 s0, s2, 0xffe8
+; GFX10-NEXT: scratch_load_dword v2, off, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX940-LABEL: sgpr_base_large_offset:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_add_u32 s0, s0, 0xffe8
+; GFX940-NEXT: scratch_load_dword v2, off, s0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX11-LABEL: sgpr_base_large_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_add_u32 s0, s0, 0xffe8
+; GFX11-NEXT: scratch_load_b32 v2, off, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sgpr_base_large_offset:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:65512
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+entry:
+ %large_offset = getelementptr i8, ptr addrspace(5) %sgpr_base, i32 65512
+ %load = load i32, ptr addrspace(5) %large_offset, align 4
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) {
+; GFX9-LABEL: sgpr_base_large_offset_split:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-NEXT: s_and_b32 s0, s2, -4
+; GFX9-NEXT: s_add_u32 s0, s0, 0x100ffe8
+; GFX9-NEXT: scratch_load_dword v2, off, s0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: sgpr_base_large_offset_split:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_add_u32 s0, s0, s5
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-NEXT: s_and_b32 s0, s2, -4
+; GFX10-NEXT: s_add_u32 s0, s0, 0x100ffe8
+; GFX10-NEXT: scratch_load_dword v2, off, s0 glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX940-LABEL: sgpr_base_large_offset_split:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_and_b32 s0, s0, -4
+; GFX940-NEXT: s_add_u32 s0, s0, 0x100ffe8
+; GFX940-NEXT: scratch_load_dword v2, off, s0 sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX11-LABEL: sgpr_base_large_offset_split:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_and_b32 s0, s0, -4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, 0x100ffe8
+; GFX11-NEXT: scratch_load_b32 v2, off, s0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sgpr_base_large_offset_split:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_and_b32 s0, s0, -4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_add_co_u32 s0, s0, 0x100ffe8
+; GFX12-NEXT: scratch_load_b32 v2, off, s0 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+entry:
+ ;%allignedBase = alloca [33554432 x i8], align 4, addrspace(5)
+ %sgpr_base_i32 = ptrtoint ptr addrspace(5) %sgpr_base to i32
+ %sgpr_base_i32_align4 = and i32 %sgpr_base_i32, 4294967292
+ %sgpr_base_align4 = inttoptr i32 %sgpr_base_i32_align4 to ptr addrspace(5)
+ %split_offset = getelementptr inbounds [33554432 x i8], ptr addrspace(5) %sgpr_base_align4, i32 0, i32 16842728
+ %load = load volatile i32, ptr addrspace(5) %split_offset, align 4
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr addrspace(5) inreg %sgpr_base, i32 inreg %sidx, i32 %vidx) {
+; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-NEXT: v_add_u32_e32 v0, s3, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xffe8
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-NEXT: v_add3_u32 v0, s2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, 15
+; GFX9-NEXT: scratch_store_dword v0, v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_add_u32 s0, s0, s5
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s3, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, 15
+; GFX10-NEXT: v_add3_u32 v0, s2, v0, 0xffe8
+; GFX10-NEXT: scratch_store_dword v0, v1, off
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_endpgm
+;
+; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
+; GFX940: ; %bb.0: ; %bb
+; GFX940-NEXT: v_add_u32_e32 v0, s1, v0
+; GFX940-NEXT: v_mov_b32_e32 v1, 0xffe8
+; GFX940-NEXT: v_add3_u32 v0, s0, v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, 15
+; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_endpgm
+;
+; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v0, s0, v0, 0xffe8
+; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:65512 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_endpgm
+bb:
+ %add1 = add nsw i32 %sidx, %vidx
+ %add2 = add nsw i32 %add1, 65512
+ %gep = getelementptr inbounds [33554432 x i8], ptr addrspace(5) %sgpr_base, i32 0, i32 %add2
+ store volatile i32 15, ptr addrspace(5) %gep, align 4
+ ret void
+}
+
+define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %scevgep) {
+; GFX9-LABEL: sgpr_base_negative_offset:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-NEXT: s_add_u32 s0, s2, 0xffffffe8
+; GFX9-NEXT: scratch_load_dword v2, off, s0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: sgpr_base_negative_offset:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_add_u32 s0, s0, s5
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-NEXT: scratch_load_dword v2, off, s2 offset:-24
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX940-LABEL: sgpr_base_negative_offset:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_add_u32 s0, s0, 0xffffffe8
+; GFX940-NEXT: scratch_load_dword v2, off, s0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX11-LABEL: sgpr_base_negative_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: scratch_load_b32 v2, off, s0 offset:-24
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sgpr_base_negative_offset:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:-24
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+entry:
+ %scevgep28 = getelementptr i8, ptr addrspace(5) %scevgep, i32 -24
+ %0 = load i32, ptr addrspace(5) %scevgep28, align 4
+ store i32 %0, ptr addrspace(1) %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 14d8b71c5167a2..284f1746145225 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -4921,5 +4921,449 @@ bb:
ret void
}
+define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) {
+; GFX9-LABEL: sgpr_base_large_offset:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-NEXT: s_add_i32 s2, s2, 0xffe8
+; GFX9-NEXT: scratch_load_dword v2, off, s2
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: sgpr_base_large_offset:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_add_u32 s0, s0, s5
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-NEXT: s_add_i32 s2, s2, 0xffe8
+; GFX10-NEXT: scratch_load_dword v2, off, s2
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: sgpr_base_large_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_add_i32 s0, s0, 0xffe8
+; GFX11-NEXT: scratch_load_b32 v2, off, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sgpr_base_large_offset:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:65512
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
+; GFX9-PAL-LABEL: sgpr_base_large_offset:
+; GFX9-PAL: ; %bb.0: ; %entry
+; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX9-PAL-NEXT: s_mov_b32 s2, s8
+; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s5
+; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-PAL-NEXT: s_add_i32 s0, s0, 0xffe8
+; GFX9-PAL-NEXT: scratch_load_dword v2, off, s0
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-PAL-NEXT: s_endpgm
+;
+; GFX940-LABEL: sgpr_base_large_offset:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_add_i32 s0, s0, 0xffe8
+; GFX940-NEXT: scratch_load_dword v2, off, s0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX10-PAL-LABEL: sgpr_base_large_offset:
+; GFX10-PAL: ; %bb.0: ; %entry
+; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX10-PAL-NEXT: s_mov_b32 s2, s8
+; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX10-PAL-NEXT: s_add_u32 s2, s2, s5
+; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX10-PAL-NEXT: s_add_i32 s0, s0, 0xffe8
+; GFX10-PAL-NEXT: scratch_load_dword v2, off, s0
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-PAL-NEXT: s_endpgm
+;
+; GFX11-PAL-LABEL: sgpr_base_large_offset:
+; GFX11-PAL: ; %bb.0: ; %entry
+; GFX11-PAL-NEXT: s_add_i32 s0, s0, 0xffe8
+; GFX11-PAL-NEXT: scratch_load_b32 v2, off, s0
+; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-PAL-NEXT: s_nop 0
+; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-PAL-NEXT: s_endpgm
+;
+; GFX12-PAL-LABEL: sgpr_base_large_offset:
+; GFX12-PAL: ; %bb.0: ; %entry
+; GFX12-PAL-NEXT: scratch_load_b32 v2, off, s0 offset:65512
+; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
+; GFX12-PAL-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-PAL-NEXT: s_nop 0
+; GFX12-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-PAL-NEXT: s_endpgm
+entry:
+ %large_offset = getelementptr i8, ptr addrspace(5) %sgpr_base, i32 65512
+ %load = load i32, ptr addrspace(5) %large_offset, align 4
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) {
+; GFX9-LABEL: sgpr_base_large_offset_split:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-NEXT: s_and_b32 s0, s2, -4
+; GFX9-NEXT: s_add_i32 s0, s0, 0x100f000
+; GFX9-NEXT: scratch_load_dword v2, off, s0 offset:4072 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: sgpr_base_large_offset_split:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_add_u32 s0, s0, s5
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-NEXT: s_and_b32 s0, s2, -4
+; GFX10-NEXT: s_add_i32 s0, s0, 0x100f800
+; GFX10-NEXT: scratch_load_dword v2, off, s0 offset:2024 glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: sgpr_base_large_offset_split:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: v_mov_b32_e32 v2, 0x100f000
+; GFX11-NEXT: s_and_b32 s0, s0, -4
+; GFX11-NEXT: scratch_load_b32 v2, v2, s0 offset:4072 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sgpr_base_large_offset_split:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: v_mov_b32_e32 v2, 0x1000000
+; GFX12-NEXT: s_and_b32 s0, s0, -4
+; GFX12-NEXT: scratch_load_b32 v2, v2, s0 offset:65512 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
+; GFX9-PAL-LABEL: sgpr_base_large_offset_split:
+; GFX9-PAL: ; %bb.0: ; %entry
+; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX9-PAL-NEXT: s_mov_b32 s2, s8
+; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s5
+; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-PAL-NEXT: s_and_b32 s0, s0, -4
+; GFX9-PAL-NEXT: s_add_i32 s0, s0, 0x100f000
+; GFX9-PAL-NEXT: scratch_load_dword v2, off, s0 offset:4072 glc
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-PAL-NEXT: s_endpgm
+;
+; GFX940-LABEL: sgpr_base_large_offset_split:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_and_b32 s0, s0, -4
+; GFX940-NEXT: v_mov_b32_e32 v2, 0x100f000
+; GFX940-NEXT: scratch_load_dword v2, v2, s0 offset:4072 sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX10-PAL-LABEL: sgpr_base_large_offset_split:
+; GFX10-PAL: ; %bb.0: ; %entry
+; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX10-PAL-NEXT: s_mov_b32 s2, s8
+; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX10-PAL-NEXT: s_add_u32 s2, s2, s5
+; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX10-PAL-NEXT: s_and_b32 s0, s0, -4
+; GFX10-PAL-NEXT: s_add_i32 s0, s0, 0x100f800
+; GFX10-PAL-NEXT: scratch_load_dword v2, off, s0 offset:2024 glc dlc
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-PAL-NEXT: s_endpgm
+;
+; GFX11-PAL-LABEL: sgpr_base_large_offset_split:
+; GFX11-PAL: ; %bb.0: ; %entry
+; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 0x100f000
+; GFX11-PAL-NEXT: s_and_b32 s0, s0, -4
+; GFX11-PAL-NEXT: scratch_load_b32 v2, v2, s0 offset:4072 glc dlc
+; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-PAL-NEXT: s_nop 0
+; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-PAL-NEXT: s_...
[truncated]
|
A large scratch offset with its highest bit set was selected as a negative offset, because a negative offset has the same 16-bit binary representation as a large unsigned offset. (cherry picked from commit e9d12a6)
…lvm#110256) Use i32 for the offset instead of i16, so that it is not interpreted as a negative 16-bit offset. (cherry picked from commit 83fe851)
@arsenm (or anyone else). If you would like to add a note about this fix in the release notes (completely optional), please reply to this comment with a one- or two-sentence description of the fix. When you are done, please add the release:note label to this PR. |
Backport e9d12a6 83fe851
Requested by: @arsenm