Skip to content

Commit

Permalink
[AMDGPU] PeepholeSDWA: Don't assume inst srcs are registers (#69576)
Browse files Browse the repository at this point in the history
To fix that ticket we only needed to address the V_LSHLREV_B16 case, but
I did it for all insts just in case.

Fixes #66899
  • Loading branch information
Pierre-vh authored Oct 19, 2023
1 parent 2b97fe2 commit d2edff8
Show file tree
Hide file tree
Showing 2 changed files with 115 additions and 4 deletions.
12 changes: 8 additions & 4 deletions llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -546,7 +546,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {

MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical())
if (!Src1->isReg() || Src1->getReg().isPhysical() ||
Dst->getReg().isPhysical())
break;

if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
Expand Down Expand Up @@ -584,7 +585,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical())
if (!Src1->isReg() || Src1->getReg().isPhysical() ||
Dst->getReg().isPhysical())
break;

if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
Expand Down Expand Up @@ -647,7 +649,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

if (Src0->getReg().isPhysical() || Dst->getReg().isPhysical())
if (!Src0->isReg() || Src0->getReg().isPhysical() ||
Dst->getReg().isPhysical())
break;

return std::make_unique<SDWASrcOperand>(
Expand Down Expand Up @@ -675,7 +678,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {

MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

if (ValSrc->getReg().isPhysical() || Dst->getReg().isPhysical())
if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() ||
Dst->getReg().isPhysical())
break;

return std::make_unique<SDWASrcOperand>(
Expand Down
107 changes: 107 additions & 0 deletions llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2098,6 +2098,113 @@ bb11: ; preds = %bb10, %bb2
br label %bb1
}

define void @crash_lshlrevb16_not_reg_op() {
; NOSDWA-LABEL: crash_lshlrevb16_not_reg_op:
; NOSDWA: ; %bb.0: ; %bb0
; NOSDWA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOSDWA-NEXT: s_mov_b64 s[4:5], 0
; NOSDWA-NEXT: s_and_b64 vcc, exec, -1
; NOSDWA-NEXT: v_lshlrev_b16_e64 v3, 8, 1
; NOSDWA-NEXT: .LBB22_1: ; %bb1
; NOSDWA-NEXT: ; =>This Inner Loop Header: Depth=1
; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
; NOSDWA-NEXT: v_mov_b32_e32 v2, 0xff
; NOSDWA-NEXT: s_lshl_b32 s6, s4, 3
; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
; NOSDWA-NEXT: s_mov_b64 s[4:5], 1
; NOSDWA-NEXT: v_and_b32_e32 v2, s4, v2
; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v3
; NOSDWA-NEXT: v_lshrrev_b16_e32 v2, s6, v2
; NOSDWA-NEXT: flat_store_byte v[0:1], v2
; NOSDWA-NEXT: s_mov_b64 vcc, vcc
; NOSDWA-NEXT: s_cbranch_vccnz .LBB22_1
; NOSDWA-NEXT: ; %bb.2: ; %DummyReturnBlock
; NOSDWA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; NOSDWA-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-LABEL: crash_lshlrevb16_not_reg_op:
; GFX89: ; %bb.0: ; %bb0
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT: s_mov_b64 s[4:5], 0
; GFX89-NEXT: s_and_b64 vcc, exec, -1
; GFX89-NEXT: v_lshlrev_b16_e64 v0, 8, 1
; GFX89-NEXT: .LBB22_1: ; %bb1
; GFX89-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX89-NEXT: v_mov_b32_e32 v3, s4
; GFX89-NEXT: s_lshl_b32 s6, s4, 3
; GFX89-NEXT: v_mov_b32_e32 v1, s4
; GFX89-NEXT: v_or_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX89-NEXT: v_mov_b32_e32 v2, s5
; GFX89-NEXT: s_mov_b64 s[4:5], 1
; GFX89-NEXT: v_lshrrev_b16_e32 v3, s6, v3
; GFX89-NEXT: flat_store_byte v[1:2], v3
; GFX89-NEXT: s_mov_b64 vcc, vcc
; GFX89-NEXT: s_cbranch_vccnz .LBB22_1
; GFX89-NEXT: ; %bb.2: ; %DummyReturnBlock
; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX89-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: crash_lshlrevb16_not_reg_op:
; GFX9: ; %bb.0: ; %bb0
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: v_lshlrev_b16_e64 v0, 8, 1
; GFX9-NEXT: s_and_b64 vcc, exec, -1
; GFX9-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: .LBB22_1: ; %bb1
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_lshl_b32 s6, s4, 3
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: s_mov_b64 s[4:5], 1
; GFX9-NEXT: v_lshrrev_b16_e32 v3, s6, v0
; GFX9-NEXT: flat_store_byte v[1:2], v3
; GFX9-NEXT: s_mov_b64 vcc, vcc
; GFX9-NEXT: s_cbranch_vccnz .LBB22_1
; GFX9-NEXT: ; %bb.2: ; %DummyReturnBlock
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: crash_lshlrevb16_not_reg_op:
; GFX10: ; %bb.0: ; %bb0
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b16 v0, 8, 1
; GFX10-NEXT: s_mov_b32 vcc_lo, exec_lo
; GFX10-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: s_mov_b64 s[4:5], 0
; GFX10-NEXT: .LBB22_1: ; %bb1
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_lshl_b32 s6, s4, 3
; GFX10-NEXT: v_mov_b32_e32 v1, s4
; GFX10-NEXT: v_mov_b32_e32 v2, s5
; GFX10-NEXT: v_lshrrev_b16 v3, s6, v0
; GFX10-NEXT: s_mov_b64 s[4:5], 1
; GFX10-NEXT: flat_store_byte v[1:2], v3
; GFX10-NEXT: s_cbranch_vccnz .LBB22_1
; GFX10-NEXT: ; %bb.2: ; %DummyReturnBlock
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
%1 = alloca [2 x i8], align 1, addrspace(5)
%2 = getelementptr [2 x i8], ptr addrspace(5) %1, i32 0, i32 1
br label %bb0

bb0:
store i8 1, ptr addrspace(5) %2, align 1
br label %bb1

bb1:
%3 = phi i64 [ 1, %bb1 ], [ 0, %bb0 ]
%4 = trunc i64 %3 to i32
%5 = getelementptr i8, ptr addrspace(5) %1, i32 %4
%6 = load i8, ptr addrspace(5) %5, align 1
%7 = getelementptr i8, ptr null, i64 %3
store i8 %6, ptr %7, align 1
br i1 false, label %bb2, label %bb1

bb2:
br label %bb0
}

declare i32 @llvm.amdgcn.workitem.id.x()

attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" }
Expand Down

0 comments on commit d2edff8

Please sign in to comment.