diff --git a/deps/llvm.mk b/deps/llvm.mk
index a0cb21e7f13f0..efe8bd4100567 100644
--- a/deps/llvm.mk
+++ b/deps/llvm.mk
@@ -429,6 +429,7 @@ $(eval $(call LLVM_PATCH,llvm-9.0-D65174-limit-merge-stores)) # remove for 10.0
 $(eval $(call LLVM_PATCH,llvm9-D71443-PPC-MC-redef-symbol)) # remove for 10.0
 $(eval $(call LLVM_PATCH,llvm-9.0-D78196)) # remove for 11.0
 $(eval $(call LLVM_PATCH,llvm-julia-tsan-custom-as))
+$(eval $(call LLVM_PATCH,llvm-9.0-D85499))
 endif # LLVM_VER 9.0
 
 ifeq ($(LLVM_VER_SHORT),10.0)
diff --git a/deps/patches/llvm-9.0-D85499.patch b/deps/patches/llvm-9.0-D85499.patch
new file mode 100644
index 0000000000000..1be91fc4717f5
--- /dev/null
+++ b/deps/patches/llvm-9.0-D85499.patch
@@ -0,0 +1,425 @@
+commit ac8729e23232d0fd3933b76093a40b7c65332aff
+Author: Keno Fischer
+Date:   Fri Aug 7 00:31:43 2020 -0400
+
+    [X86] Canonicalize andnp for bitmask arithmetic
+
+    We have a DAG combine that tries to fold (vselect cond, 0000..., X) -> (andnp cond, x).
+    However, it does so by attempting to create an i64 vector with the number
+    of elements obtained by truncating division by 64 from the bitwidth. This is
+    bad for mask vectors like v8i1, since that division is just zero. Besides,
+    we don't want i64 vectors anyway. The easy change is just to avoid changing
+    the VT, but this is slightly problematic because the canonical pattern for
+    `kandn` is `(and (vnot a) b)` rather than `(x86andnp a b)`, so this fails
+    to select. Rather than playing games here with having the mask vectors
+    use a different canonical representation, the bulk of this commit switches
+    the canonical ISD representation for `kandn` to `(x86andnp a b)` such
+    that all vector types may be handled equally here. To avoid regressing
+    other tests, we need to extend a few other folds to handle `x86andnp` in
+    addition to plain `and`. However, that should be generally a good
+    improvement, since x86andnp is already canonical for non-i1 vectors
+    prior to this commit, and said folds were just missing.
+
+    When all is said and done, fixes the issue reported in
+    https://github.com/JuliaLang/julia/issues/36955.
+
+    Differential Revision: https://reviews.llvm.org/D85499
+
+diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
+index 34ad589d205..eb21b0de89d 100644
+--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
++++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
+@@ -503,7 +503,7 @@ namespace {
+     bool isMaskZeroExtended(SDNode *N) const;
+     bool tryShiftAmountMod(SDNode *N);
+     bool tryShrinkShlLogicImm(SDNode *N);
+-    bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
++    bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask, bool Invert);
+ 
+     MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
+                                 const SDLoc &dl, MVT VT, SDNode *Node);
+@@ -2998,7 +2998,7 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
+       bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
+       // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec.
+       if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
+-        unsigned NewOpc =
++        unsigned NewOpc =
+           ((Opc == X86ISD::ADD) == IsOne)
+               ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
+               : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
+@@ -3999,8 +3999,8 @@ static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
+ 
+ // Try to create VPTESTM instruction. If InMask is not null, it will be used
+ // to form a masked operation.
+-bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
+-                                 SDValue InMask) {
++bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue InMask,
++                                 bool Invert) {
+   assert(Subtarget->hasAVX512() && "Expected AVX512!");
+   assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
+          "Unexpected VT!");
+@@ -4140,6 +4140,9 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
+   }
+ 
+   bool IsTestN = CC == ISD::SETEQ;
++  if (Invert)
++    IsTestN = !IsTestN;
++
+   unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
+                                IsMasked);
+@@ -4309,16 +4312,27 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
+       return;
+     break;
+ 
++  case X86ISD::ANDNP:
++    if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
++      SDValue N0 = Node->getOperand(0);
++      SDValue N1 = Node->getOperand(1);
++      // Try to form a masked VPTESTM
++      if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
++          tryVPTESTM(Node, N0, N1, true))
++        return;
++    }
++    break;
++
+   case ISD::AND:
+     if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
+       // Try to form a masked VPTESTM. Operands can be in either order.
+       SDValue N0 = Node->getOperand(0);
+       SDValue N1 = Node->getOperand(1);
+       if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
+-          tryVPTESTM(Node, N0, N1))
++          tryVPTESTM(Node, N0, N1, false))
+         return;
+       if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
+-          tryVPTESTM(Node, N1, N0))
++          tryVPTESTM(Node, N1, N0, false))
+         return;
+     }
+ 
+@@ -5000,7 +5014,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
+   }
+ 
+   case ISD::SETCC: {
+-    if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
++    if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue(), false))
+       return;
+ 
+     break;
+diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
+index 920cdd7e625..6b9738074c7 100644
+--- a/lib/Target/X86/X86ISelLowering.cpp
++++ b/lib/Target/X86/X86ISelLowering.cpp
+@@ -196,7 +196,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
+   // Integer absolute.
+   if (Subtarget.hasCMov()) {
+     setOperationAction(ISD::ABS , MVT::i16 , Custom);
+-    setOperationAction(ISD::ABS , MVT::i32 , Custom);
++    setOperationAction(ISD::ABS , MVT::i32 , Custom);
+   }
+   setOperationAction(ISD::ABS , MVT::i64 , Custom);
+ 
+@@ -26053,7 +26053,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
+ 
+   // If this is a canonical idempotent atomicrmw w/no uses, we have a better
+   // lowering available in lowerAtomicArith.
+-  // TODO: push more cases through this path.
++  // TODO: push more cases through this path.
+   if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
+     if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
+         AI->use_empty())
+@@ -26111,7 +26111,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
+ /// Emit a locked operation on a stack location which does not change any
+ /// memory location, but does involve a lock prefix. Location is chosen to be
+ /// a) very likely accessed only by a single thread to minimize cache traffic,
+-/// and b) definitely dereferenceable. Returns the new Chain result.
++/// and b) definitely dereferenceable. Returns the new Chain result.
+ static SDValue emitLockedStackOp(SelectionDAG &DAG,
+                                  const X86Subtarget &Subtarget,
+                                  SDValue Chain, SDLoc DL) {
+@@ -26120,22 +26120,22 @@ static SDValue emitLockedStackOp(SelectionDAG &DAG,
+   // operations issued by the current processor. As such, the location
+   // referenced is not relevant for the ordering properties of the instruction.
+   // See: Intel® 64 and IA-32 ArchitecturesSoftware Developer’s Manual,
+-  // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
++  // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
+   // 2) Using an immediate operand appears to be the best encoding choice
+   // here since it doesn't require an extra register.
+   // 3) OR appears to be very slightly faster than ADD. (Though, the difference
+   // is small enough it might just be measurement noise.)
+   // 4) When choosing offsets, there are several contributing factors:
+   //   a) If there's no redzone, we default to TOS. (We could allocate a cache
+-  //      line aligned stack object to improve this case.)
++  //      line aligned stack object to improve this case.)
+   //   b) To minimize our chances of introducing a false dependence, we prefer
+-  //      to offset the stack usage from TOS slightly.
++  //      to offset the stack usage from TOS slightly.
+   //   c) To minimize concerns about cross thread stack usage - in particular,
+   //      the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
+   //      captures state in the TOS frame and accesses it from many threads -
+   //      we want to use an offset such that the offset is in a distinct cache
+   //      line from the TOS frame.
+-  //
++  //
+   // For a general discussion of the tradeoffs and benchmark results, see:
+   // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+ 
+@@ -26188,7 +26188,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
+   if (Subtarget.hasMFence())
+     return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
+ 
+-  SDValue Chain = Op.getOperand(0);
++  SDValue Chain = Op.getOperand(0);
+   return emitLockedStackOp(DAG, Subtarget, Chain, dl);
+ }
+ 
+@@ -26677,12 +26677,12 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
+   // seq_cst which isn't SingleThread, everything just needs to be preserved
+   // during codegen and then dropped. Note that we expect (but don't assume),
+   // that orderings other than seq_cst and acq_rel have been canonicalized to
+-  // a store or load.
++  // a store or load.
+   if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
+       AN->getSyncScopeID() == SyncScope::System) {
+     // Prefer a locked operation against a stack location to minimize cache
+     // traffic. This assumes that stack locations are very likely to be
+-    // accessed only by the owning thread.
++    // accessed only by the owning thread.
+     SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
+     assert(!N->hasAnyUseOfValue(0));
+     // NOTE: The getUNDEF is needed to give something for the unused result 0.
+@@ -35620,7 +35620,7 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
+   }
+ 
+   // TODO: This switch could include FNEG and the x86-specific FP logic ops
+-  // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
++  // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
+   // missed load folding and fma+fneg combining.
+   switch (Vec.getOpcode()) {
+   case ISD::FMA: // Begin 3 operands
+@@ -35935,10 +35935,8 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
+ 
+   // vselect Cond, 000..., X -> andn Cond, X
+   if (TValIsAllZeros) {
+-    MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
+-    SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
+-    SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
+-    SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
++    SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
++    SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
+     return DAG.getBitcast(VT, AndN);
+   }
+ 
+@@ -38147,12 +38145,17 @@ static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
+   return SDValue();
+ }
+ 
+-/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
+-static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
++
++/// Try to fold:
++///   (and (not X), Y) -> (andnp X, Y)
++///   (and (xor X, -1), Y) -> (andnp X, Y).
++static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG,
++                                                 const X86Subtarget &Subtarget) {
+   assert(N->getOpcode() == ISD::AND);
+ 
+   MVT VT = N->getSimpleValueType(0);
+-  if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
++  if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector() &&
++      !(VT.isVector() && VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()))
+     return SDValue();
+ 
+   SDValue X, Y;
+@@ -38558,7 +38561,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
+   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+     return FPLogic;
+ 
+-  if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
++  if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG, Subtarget))
+     return R;
+ 
+   if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
+diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
+index 54eddeacaa1..91027fa903f 100644
+--- a/lib/Target/X86/X86InstrAVX512.td
++++ b/lib/Target/X86/X86InstrAVX512.td
+@@ -2978,7 +2978,6 @@ multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr,
+ def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>;
+ def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>;
+ // These nodes use 'vnot' instead of 'not' to support vectors.
+-def vandn : PatFrag<(ops node:$i0, node:$i1), (and (vnot node:$i0), node:$i1)>;
+ def vxnor : PatFrag<(ops node:$i0, node:$i1), (vnot (xor node:$i0, node:$i1))>;
+ 
+ // TODO - do we need a X86SchedWriteWidths::KMASK type?
+@@ -2986,7 +2985,7 @@ defm KAND : avx512_mask_binop_all<0x41, "kand", and, SchedWriteVecLogic.XM
+ defm KOR : avx512_mask_binop_all<0x45, "kor", or, SchedWriteVecLogic.XMM, 1>;
+ defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", vxnor, SchedWriteVecLogic.XMM, 1>;
+ defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, SchedWriteVecLogic.XMM, 1>;
+-defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, SchedWriteVecLogic.XMM, 0>;
++defm KANDN : avx512_mask_binop_all<0x42, "kandn", X86andnp, SchedWriteVecLogic.XMM, 0>;
+ defm KADD : avx512_mask_binop_all<0x4A, "kadd", X86kadd, SchedWriteVecLogic.XMM, 1, HasDQI>;
+ 
+ multiclass avx512_binop_pat;
+-defm : avx512_binop_pat;
++defm : avx512_binop_pat;
+ defm : avx512_binop_pat;
+ defm : avx512_binop_pat;
+ defm : avx512_binop_pat;
+@@ -11570,7 +11569,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr,
+ }
+ 
+ multiclass avx512_fixupimm_packed_all {
+   let Predicates = [HasAVX512] in
+     defm Z : avx512_fixupimm_packed_sae<0x54, "vfixupimm", sched.ZMM,
+@@ -11687,7 +11686,7 @@ multiclass AVX512_scalar_math_fp_patterns("V"#OpcPrefix#Zrr_Intkz)
++                (!cast("V"#OpcPrefix#Zrr_Intkz)
+                     VK1WM:$mask, _.VT:$src1,
+                     (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
+       def : Pat<(MoveNode (_.VT VR128X:$src1),
+diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll
+index 1ed7b408baf..64320d63eac 100644
+--- a/test/CodeGen/X86/avx512-select.ll
++++ b/test/CodeGen/X86/avx512-select.ll
+@@ -595,3 +595,74 @@ define <16 x i64> @narrowExtractedVectorSelect_crash(<16 x i64> %arg, <16 x i16>
+   %tmp3 = zext <16 x i16> %tmp2 to <16 x i64>
+   ret <16 x i64> %tmp3
+ }
++
++; Regression test from https://github.com/JuliaLang/julia/issues/36955
++define i8 @julia_issue36955(<8 x i1> %mask, <8 x double> %a) {
++; X86-AVX512F-LABEL: julia_issue36955:
++; X86-AVX512F:       # %bb.0:
++; X86-AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
++; X86-AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
++; X86-AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
++; X86-AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
++; X86-AVX512F-NEXT:    vcmpnlepd %zmm0, %zmm1, %k1
++; X86-AVX512F-NEXT:    kandnw %k0, %k1, %k0
++; X86-AVX512F-NEXT:    kandw %k1, %k0, %k0
++; X86-AVX512F-NEXT:    knotw %k1, %k1
++; X86-AVX512F-NEXT:    korw %k1, %k0, %k0
++; X86-AVX512F-NEXT:    kmovw %k0, %eax
++; X86-AVX512F-NEXT:    # kill: def $al killed $al killed $eax
++; X86-AVX512F-NEXT:    vzeroupper
++; X86-AVX512F-NEXT:    retl
++;
++; X64-AVX512F-LABEL: julia_issue36955:
++; X64-AVX512F:       # %bb.0:
++; X64-AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
++; X64-AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
++; X64-AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
++; X64-AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
++; X64-AVX512F-NEXT:    vcmpnlepd %zmm0, %zmm1, %k1
++; X64-AVX512F-NEXT:    kandnw %k0, %k1, %k0
++; X64-AVX512F-NEXT:    kandw %k1, %k0, %k0
++; X64-AVX512F-NEXT:    knotw %k1, %k1
++; X64-AVX512F-NEXT:    korw %k1, %k0, %k0
++; X64-AVX512F-NEXT:    kmovw %k0, %eax
++; X64-AVX512F-NEXT:    # kill: def $al killed $al killed $eax
++; X64-AVX512F-NEXT:    vzeroupper
++; X64-AVX512F-NEXT:    retq
++;
++; X86-AVX512BW-LABEL: julia_issue36955:
++; X86-AVX512BW:       # %bb.0:
++; X86-AVX512BW-NEXT:    vpsllw $15, %xmm0, %xmm0
++; X86-AVX512BW-NEXT:    vpmovw2m %zmm0, %k0
++; X86-AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
++; X86-AVX512BW-NEXT:    vcmpnlepd %zmm0, %zmm1, %k1
++; X86-AVX512BW-NEXT:    kandnw %k0, %k1, %k0
++; X86-AVX512BW-NEXT:    kandw %k1, %k0, %k0
++; X86-AVX512BW-NEXT:    knotw %k1, %k1
++; X86-AVX512BW-NEXT:    korw %k1, %k0, %k0
++; X86-AVX512BW-NEXT:    kmovd %k0, %eax
++; X86-AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
++; X86-AVX512BW-NEXT:    vzeroupper
++; X86-AVX512BW-NEXT:    retl
++;
++; X64-AVX512BW-LABEL: julia_issue36955:
++; X64-AVX512BW:       # %bb.0:
++; X64-AVX512BW-NEXT:    vpsllw $15, %xmm0, %xmm0
++; X64-AVX512BW-NEXT:    vpmovw2m %zmm0, %k0
++; X64-AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
++; X64-AVX512BW-NEXT:    vcmpnlepd %zmm0, %zmm1, %k1
++; X64-AVX512BW-NEXT:    kandnw %k0, %k1, %k0
++; X64-AVX512BW-NEXT:    kandw %k1, %k0, %k0
++; X64-AVX512BW-NEXT:    knotw %k1, %k1
++; X64-AVX512BW-NEXT:    korw %k1, %k0, %k0
++; X64-AVX512BW-NEXT:    kmovd %k0, %eax
++; X64-AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
++; X64-AVX512BW-NEXT:    vzeroupper
++; X64-AVX512BW-NEXT:    retq
++  %fcmp = fcmp ugt <8 x double> %a, zeroinitializer
++  %xor = xor <8 x i1> %fcmp, <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
++  %select1 = select <8 x i1> %fcmp, <8 x i1> zeroinitializer, <8 x i1> %mask
++  %select2 = select <8 x i1> %xor, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i1> %select1
++  %ret = bitcast <8 x i1> %select2 to i8
++  ret i8 %ret
++}
+diff --git a/test/CodeGen/X86/combine-bitselect.ll b/test/CodeGen/X86/combine-bitselect.ll
+index 8cb6a4dca09..3c08a871c86 100644
+--- a/test/CodeGen/X86/combine-bitselect.ll
++++ b/test/CodeGen/X86/combine-bitselect.ll
+@@ -616,13 +616,13 @@ define <4 x i1> @bitselect_v4i1_loop(<4 x i32> %a0, <4 x i32> %a1) {
+ ; AVX512F:       # %bb.0: # %bb
+ ; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+ ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [12,12,12,12]
+-; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
++; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
++; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [12,12,12,12]
+ ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15]
+-; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm1, %k2
+-; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0 {%k2}
+-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
+-; AVX512F-NEXT:    korw %k0, %k1, %k1
++; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm1, %k0
++; AVX512F-NEXT:    vpcmpeqd %zmm0, %zmm1, %k2 {%k1}
++; AVX512F-NEXT:    kandnw %k0, %k1, %k0
++; AVX512F-NEXT:    korw %k0, %k2, %k1
+ ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+ ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+ ; AVX512F-NEXT:    vzeroupper
+diff --git a/test/CodeGen/X86/vec_ssubo.ll b/test/CodeGen/X86/vec_ssubo.ll
+index 515dc5c5aa2..dfb1e7c4dee 100644
+--- a/test/CodeGen/X86/vec_ssubo.ll
++++ b/test/CodeGen/X86/vec_ssubo.ll
+@@ -1640,7 +1640,7 @@ define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
+ ; AVX512-NEXT:    vptestmd %xmm1, %xmm1, %k0
+ ; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
+ ; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k1
+-; AVX512-NEXT:    vptestnmd %xmm1, %xmm1, %k2 {%k1}
++; AVX512-NEXT:    kandnw %k1, %k0, %k2
+ ; AVX512-NEXT:    kxorw %k0, %k1, %k0
+ ; AVX512-NEXT:    kxorw %k2, %k0, %k1
+ ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+diff --git a/test/CodeGen/X86/vec_usubo.ll b/test/CodeGen/X86/vec_usubo.ll
+index c5a7b19cf14..367c491d25a 100644
+--- a/test/CodeGen/X86/vec_usubo.ll
++++ b/test/CodeGen/X86/vec_usubo.ll
+@@ -1244,10 +1244,10 @@ define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
+ ; AVX512:       # %bb.0:
+ ; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
+ ; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
+-; AVX512-NEXT:    vpslld $31, %xmm1, %xmm1
+-; AVX512-NEXT:    vptestmd %xmm1, %xmm1, %k1
++; AVX512-NEXT:    vpslld $31, %xmm1, %xmm0
++; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k1
+ ; AVX512-NEXT:    kxorw %k1, %k0, %k1
+-; AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k2 {%k1}
++; AVX512-NEXT:    kandnw %k1, %k0, %k2
+ ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+ ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k2} {z}
+ ; AVX512-NEXT:    kmovd %k1, %eax