Skip to content

Commit

Permalink
AMDGPU: Add test for 16 bit unsigned scratch offsets (#110255)
Browse files Browse the repository at this point in the history
A large scratch offset whose highest bit is set gets selected as a negative
offset, because a negative offset has the same 16-bit binary representation
as a large unsigned offset.
  • Loading branch information
petar-avramovic authored Sep 30, 2024
1 parent f627c45 commit e9d12a6
Show file tree
Hide file tree
Showing 2 changed files with 683 additions and 0 deletions.
239 changes: 239 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1510,4 +1510,243 @@ bb:
ret void
}

; Loads an i32 from the scratch pointer %sgpr_base (passed inreg) plus a byte
; offset of 65512 (0xffe8, i.e. bit 15 set) and stores it to the global
; pointer %out.
; Expected selection per the assertions below - GFX9, GFX10, GFX940 and GFX11
; add 0xffe8 to the base with s_add_u32 and issue the load without an
; immediate offset, while GFX12 keeps the value in the instruction's offset
; field (offset 65512).
define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) {
; GFX9-LABEL: sgpr_base_large_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_add_u32 s0, s2, 0xffe8
; GFX9-NEXT: scratch_load_dword v2, off, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sgpr_base_large_offset:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_add_u32 s0, s0, s5
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: s_add_u32 s0, s2, 0xffe8
; GFX10-NEXT: scratch_load_dword v2, off, s0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
; GFX940-LABEL: sgpr_base_large_offset:
; GFX940: ; %bb.0: ; %entry
; GFX940-NEXT: s_add_u32 s0, s0, 0xffe8
; GFX940-NEXT: scratch_load_dword v2, off, s0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
; GFX940-NEXT: s_endpgm
;
; GFX11-LABEL: sgpr_base_large_offset:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_add_u32 s0, s0, 0xffe8
; GFX11-NEXT: scratch_load_b32 v2, off, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: sgpr_base_large_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:65512
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
entry:
%large_offset = getelementptr i8, ptr addrspace(5) %sgpr_base, i32 65512
%load = load i32, ptr addrspace(5) %large_offset, align 4
store i32 %load, ptr addrspace(1) %out
ret void
}

; Volatile i32 load from a 4-byte-aligned scratch base plus a constant offset
; of 16842728 (0x100ffe8), with the result stored to the global pointer %out.
; The base is aligned by round-tripping %sgpr_base through an integer and
; masking with 4294967292 (0xfffffffc), which the assertions below show as
; s_and_b32 with -4.
; In the assertions below every target adds the full 0x100ffe8 constant with
; a scalar add and the load carries no immediate offset.
define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) {
; GFX9-LABEL: sgpr_base_large_offset_split:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_and_b32 s0, s2, -4
; GFX9-NEXT: s_add_u32 s0, s0, 0x100ffe8
; GFX9-NEXT: scratch_load_dword v2, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sgpr_base_large_offset_split:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_add_u32 s0, s0, s5
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: s_and_b32 s0, s2, -4
; GFX10-NEXT: s_add_u32 s0, s0, 0x100ffe8
; GFX10-NEXT: scratch_load_dword v2, off, s0 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
; GFX940-LABEL: sgpr_base_large_offset_split:
; GFX940: ; %bb.0: ; %entry
; GFX940-NEXT: s_and_b32 s0, s0, -4
; GFX940-NEXT: s_add_u32 s0, s0, 0x100ffe8
; GFX940-NEXT: scratch_load_dword v2, off, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
; GFX940-NEXT: s_endpgm
;
; GFX11-LABEL: sgpr_base_large_offset_split:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_and_b32 s0, s0, -4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_u32 s0, s0, 0x100ffe8
; GFX11-NEXT: scratch_load_b32 v2, off, s0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: sgpr_base_large_offset_split:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_and_b32 s0, s0, -4
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_co_u32 s0, s0, 0x100ffe8
; GFX12-NEXT: scratch_load_b32 v2, off, s0 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
entry:
; NOTE(review) the next line is commented-out IR and has no effect on the
; test; consider removing it.
;%allignedBase = alloca [33554432 x i8], align 4, addrspace(5)
%sgpr_base_i32 = ptrtoint ptr addrspace(5) %sgpr_base to i32
%sgpr_base_i32_align4 = and i32 %sgpr_base_i32, 4294967292
%sgpr_base_align4 = inttoptr i32 %sgpr_base_i32_align4 to ptr addrspace(5)
%split_offset = getelementptr inbounds [33554432 x i8], ptr addrspace(5) %sgpr_base_align4, i32 0, i32 16842728
%load = load volatile i32, ptr addrspace(5) %split_offset, align 4
store i32 %load, ptr addrspace(1) %out
ret void
}

; Volatile store of the constant 15 to scratch at
; %sgpr_base + (%sidx + %vidx + 65512), where %sgpr_base and %sidx are passed
; inreg and %vidx is not. 65512 is 0xffe8, i.e. bit 15 is set.
; Expected selection per the assertions below - GFX9 and GFX940 first move
; 0xffe8 into a VGPR and fold it in with v_add3_u32, GFX10 and GFX11 use
; 0xffe8 as a literal operand of v_add3_u32, and GFX12 keeps the constant in
; the store's offset field (offset 65512).
define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr addrspace(5) inreg %sgpr_base, i32 inreg %sidx, i32 %vidx) {
; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
; GFX9-NEXT: v_add_u32_e32 v0, s3, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffe8
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: v_add3_u32 v0, s2, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 15
; GFX9-NEXT: scratch_store_dword v0, v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s5
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, s3, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: v_add3_u32 v0, s2, v0, 0xffe8
; GFX10-NEXT: scratch_store_dword v0, v1, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
; GFX940: ; %bb.0: ; %bb
; GFX940-NEXT: v_add_u32_e32 v0, s1, v0
; GFX940-NEXT: v_mov_b32_e32 v1, 0xffe8
; GFX940-NEXT: v_add3_u32 v0, s0, v0, v1
; GFX940-NEXT: v_mov_b32_e32 v1, 15
; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
;
; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 0xffe8
; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:65512 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_endpgm
bb:
%add1 = add nsw i32 %sidx, %vidx
%add2 = add nsw i32 %add1, 65512
%gep = getelementptr inbounds [33554432 x i8], ptr addrspace(5) %sgpr_base, i32 0, i32 %add2
store volatile i32 15, ptr addrspace(5) %gep, align 4
ret void
}

; Loads an i32 from the scratch pointer %scevgep (passed inreg) minus 24
; bytes and stores it to the global pointer %out.
; -24 is 0xffffffe8 as a 32-bit value, so its low 16 bits (0xffe8) match the
; unsigned offset 65512 used by the large-offset tests in this file.
; Expected selection per the assertions below - GFX9 and GFX940 add
; 0xffffffe8 to the base with s_add_u32, while GFX10, GFX11 and GFX12 keep
; the value in the load's offset field (offset -24).
define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %scevgep) {
; GFX9-LABEL: sgpr_base_negative_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_add_u32 s0, s2, 0xffffffe8
; GFX9-NEXT: scratch_load_dword v2, off, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sgpr_base_negative_offset:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_add_u32 s0, s0, s5
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_dword v2, off, s2 offset:-24
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
; GFX940-LABEL: sgpr_base_negative_offset:
; GFX940: ; %bb.0: ; %entry
; GFX940-NEXT: s_add_u32 s0, s0, 0xffffffe8
; GFX940-NEXT: scratch_load_dword v2, off, s0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: global_store_dword v[0:1], v2, off sc0 sc1
; GFX940-NEXT: s_endpgm
;
; GFX11-LABEL: sgpr_base_negative_offset:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: scratch_load_b32 v2, off, s0 offset:-24
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: sgpr_base_negative_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: scratch_load_b32 v2, off, s0 offset:-24
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
entry:
%scevgep28 = getelementptr i8, ptr addrspace(5) %scevgep, i32 -24
%0 = load i32, ptr addrspace(5) %scevgep28, align 4
store i32 %0, ptr addrspace(1) %out
ret void
}

; Declaration of the llvm.amdgcn.workitem.id.x intrinsic (no arguments,
; returns i32). The four scratch-offset tests directly above do not call it;
; presumably it is used by functions elsewhere in this file.
declare i32 @llvm.amdgcn.workitem.id.x()
Loading

0 comments on commit e9d12a6

Please sign in to comment.