-
Notifications
You must be signed in to change notification settings - Fork 12.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
DAG: Handle vector legalization of minimumnum/maximumnum #109779
Conversation
Follow the same patterns as the other min/max variants.
This stack of pull requests is managed by Graphite. Learn more about stacking.
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-selectiondag Author: Matt Arsenault (arsenm). Changes: Follow the same patterns as the other min/max variants. The patch is 110.01 KiB and is truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/109779.diff (5 files affected):
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 1c466ed0b77997..0a22f06271984e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -4537,6 +4537,8 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::VP_FMINIMUM:
case ISD::FMAXIMUM:
case ISD::VP_FMAXIMUM:
+ case ISD::FMINIMUMNUM:
+ case ISD::FMAXIMUMNUM:
case ISD::SMIN: case ISD::VP_SMIN:
case ISD::SMAX: case ISD::VP_SMAX:
case ISD::UMIN: case ISD::VP_UMIN:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a2a232ed93b72f..f19975557a0a77 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8606,6 +8606,9 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node,
return DAG.getNode(IEEE2008Op, DL, VT, LHS, RHS, Flags);
}
+ if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT))
+ return DAG.UnrollVectorOp(Node);
+
// If only one operand is NaN, override it with another operand.
if (!Flags.hasNoNaNs() && !DAG.isKnownNeverNaN(LHS)) {
LHS = DAG.getSelectCC(DL, LHS, LHS, RHS, LHS, ISD::SETUO);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a9754ba357893f..749be5632733a1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -753,7 +753,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
- setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
+ setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
+ ISD::FMAXIMUMNUM},
{MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
Custom);
@@ -5842,6 +5843,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMAXNUM_IEEE:
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
+ case ISD::FMINIMUMNUM:
+ case ISD::FMAXIMUMNUM:
case ISD::UADDSAT:
case ISD::USUBSAT:
case ISD::SADDSAT:
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
index 506f40516c9e6e..25a6c80b917946 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
@@ -1734,3 +1734,1268 @@ define double @v_maximumnum_f64_fneg(double %x, double %y) {
%result = call double @llvm.maximumnum.f64(double %fneg.x, double %fneg.y)
ret double %result
}
+
+define <2 x half> @v_maximumnum_v2f16(<2 x half> %x, <2 x half> %y) {
+; GFX8-LABEL: v_maximumnum_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximumnum_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximumnum_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximumnum_v2f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %result = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> %x, <2 x half> %y)
+ ret <2 x half> %result
+}
+
+define <2 x half> @v_maximumnum_v2f16_nnan(<2 x half> %x, <2 x half> %y) {
+; GFX8-LABEL: v_maximumnum_v2f16_nnan:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximumnum_v2f16_nnan:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v2f16_nnan:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximumnum_v2f16_nnan:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximumnum_v2f16_nnan:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %result = call nnan <2 x half> @llvm.maximumnum.v2f16(<2 x half> %x, <2 x half> %y)
+ ret <2 x half> %result
+}
+
+define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
+; GFX8-LABEL: v_maximumnum_v3f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v5, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v2
+; GFX8-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximumnum_v3f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v3f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximumnum_v3f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximumnum_v3f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v0
+; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v3
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %result = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> %x, <3 x half> %y)
+ ret <3 x half> %result
+}
+
+define <3 x half> @v_maximumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) {
+; GFX8-LABEL: v_maximumnum_v3f16_nnan:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximumnum_v3f16_nnan:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v3f16_nnan:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximumnum_v3f16_nnan:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximumnum_v3f16_nnan:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %result = call nnan <3 x half> @llvm.maximumnum.v3f16(<3 x half> %x, <3 x half> %y)
+ ret <3 x half> %result
+}
+
+define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) {
+; GFX8-LABEL: v_maximumnum_v4f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v5, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT: v_max_f16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximumnum_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v4f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximumnum_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximumnum_v4f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v0
+; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v3
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> %x, <4 x half> %y)
+ ret <4 x half> %result
+}
+
+define <4 x half> @v_maximumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) {
+; GFX8-LABEL: v_maximumnum_v4f16_nnan:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v2
+; GFX8-NEXT: v_max_f16_sdwa v2, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v3
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximumnum_v4f16_nnan:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v4f16_nnan:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximumnum_v4f16_nnan:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximumnum_v4f16_nnan:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %result = call nnan <4 x half> @llvm.maximumnum.v4f16(<4 x half> %x, <4 x half> %y)
+ ret <4 x half> %result
+}
+
+define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) {
+; GFX8-LABEL: v_maximumnum_v6f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_sdwa v6, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v7, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_max_f16_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v3
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v6
+; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT: v_max_f16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v4
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT: v_max_f16_sdwa v3, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v4, v5, v5
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximumnum_v6f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v3
+; GFX9-NEXT: v_pk_max_f16 v3, v4, v4
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX9-NEXT: v_pk_max_f16 v3, v5, v5
+; GFX9-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX9-NEXT: v_pk_max_f16 v2, v2, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v6f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v3
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v4
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximumnum_v6f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v4
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximumnum_v6f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v3
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v0
+; GFX12-NEXT: v_pk_max_num_f16 v4, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1
+; GFX12-NEXT: v_pk_max_num_f16 v5, v5, v5
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(...
[truncated]
|
ping
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Follow the same patterns as the other min/max variants.
Follow the same patterns as the other min/max variants.