diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index c38c2dc0f5f6188..e09e09cf179e70f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -306,6 +306,23 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
   return !F || !ST->isSingleLaneExecution(*F);
 }
 
+unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
+  // For certain 8 bit ops, we can pack a v4i8 into a single part
+  // (e.g. v4i8 shufflevectors -> v_perm v4i8, v4i8). Thus, report one
+  // part per group of up to four i8 elements instead of the legalization
+  // cost. The count is rounded up so that e.g. v2i8 is one part rather
+  // than zero. It is left up to other target queries (e.g.
+  // get*InstrCost) to decide the proper handling of 8 bit vectors.
+  if (FixedVectorType *VTy = dyn_cast<FixedVectorType>(Tp)) {
+    if (DL.getTypeSizeInBits(VTy->getElementType()) == 8) {
+      unsigned ElCount = VTy->getElementCount().getFixedValue();
+      return alignTo(ElCount, 4) / 4;
+    }
+  }
+
+  return BaseT::getNumberOfParts(Tp);
+}
+
 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
   // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
   // registers. See getRegisterClassForType for the implementation.
@@ -337,9 +354,11 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
-  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
-       : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
-       : 1;
+  // Packed VF; sub-dword widths are only packed when 16-bit insts exist.
+  return (ElemWidth == 8 && ST->has16BitInsts())       ? 4
+         : (ElemWidth == 16 && ST->has16BitInsts())    ? 2
+         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+                                                       : 1;
 }
 
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1140,14 +1159,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
 
   Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
 
-  // Larger vector widths may require additional instructions, but are
-  // typically cheaper than scalarized versions.
-  unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
+  unsigned ScalarSize = DL.getTypeSizeInBits(VT->getElementType());
   if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
-      DL.getTypeSizeInBits(VT->getElementType()) == 16) {
-    bool HasVOP3P = ST->hasVOP3PInsts();
+      (ScalarSize == 16 || ScalarSize == 8)) {
+    // Larger vector widths may require additional instructions, but are
+    // typically cheaper than scalarized versions.
+    unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
     unsigned RequestedElts =
         count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
+    unsigned EltsPerReg = 32 / ScalarSize;
     if (RequestedElts == 0)
       return 0;
     switch (Kind) {
@@ -1156,9 +1176,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     case TTI::SK_PermuteSingleSrc: {
       // With op_sel VOP3P instructions freely can access the low half or high
       // half of a register, so any swizzle of two elements is free.
-      if (HasVOP3P && NumVectorElts == 2)
+      if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
         return 0;
-      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+      unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
       // SK_Broadcast just reuses the same mask
       unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
       return NumPerms + NumPermMasks;
@@ -1170,12 +1190,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
         return 0;
       // Insert/extract subvectors only require shifts / extract code to get the
       // relevant bits
-      return alignTo(RequestedElts, 2) / 2;
+      return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
     }
     case TTI::SK_PermuteTwoSrc:
     case TTI::SK_Splice:
     case TTI::SK_Select: {
-      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+      unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
       // SK_Select just reuses the same mask
       unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
       return NumPerms + NumPermMasks;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 01df2e6caaba1d1..6d889dfa93786da 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -117,6 +117,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
     return TTI::PSK_FastHardware;
   }
 
+  unsigned getNumberOfParts(Type *Tp);
   unsigned getNumberOfRegisters(unsigned RCID) const;
   TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
   unsigned getMinVectorRegisterBitWidth() const;
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
index a18156744a36b1c..adcabe70ec609e5 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
@@ -396,157 +396,157 @@ define amdgpu_kernel void @shufflevector_i16(<2 x i16> %vec1, <2 x i16> %vec2) {
 ; Should not assert
 define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) {
 ; ALL-LABEL: 'shufflevector_i8'
-; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for
instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> 
%vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf31 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf32 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf000 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> zeroinitializer -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf001 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf010 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated 
cost of 6 for instruction: %shuf011 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf100 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf101 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf110 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf111 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf002 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf020 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf022 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf200 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf202 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf220 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf222 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf112 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf121 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf122 = shufflevector <2 x i8> %vec1, 
<2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf211 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf212 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf221 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf000 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> zeroinitializer +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf001 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf010 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf011 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf100 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf101 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf110 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf111 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf002 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; 
ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf020 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf022 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf200 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf202 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf220 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf222 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf112 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf121 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf122 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf211 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf212 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf221 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 
for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03_2 = shufflevector <2 x i8> 
%vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf31_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf32_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf000_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> zeroinitializer -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf001_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf010_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf011_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf100_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> 
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf101_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf110_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf111_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf002_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf020_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf022_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf200_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf202_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf220_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf222_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf112_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf121_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf122_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf211_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an 
estimated cost of 6 for instruction: %shuf212_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf221_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf000_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> zeroinitializer +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf001_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf010_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf011_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf100_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf101_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf110_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf111_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf002_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf020_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf022_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %shuf200_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf202_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf220_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf222_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf112_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf121_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf122_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf211_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf212_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf221_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; ALL-SIZE-LABEL: 'shufflevector_i8' -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
%shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = 
shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf31 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf32 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf000 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> zeroinitializer -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf001 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf010 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf011 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: 
%shuf100 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf101 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf110 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf111 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf002 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf020 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf022 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf200 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf202 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf220 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf222 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf112 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf121 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf122 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for 
instruction: %shuf211 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf212 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf221 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf000 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> zeroinitializer +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf001 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf010 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf011 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf100 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf101 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf110 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf111 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf002 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %shuf020 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf022 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf200 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf202 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf220 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf222 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf112 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf121 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf122 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf211 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf212 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf221 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; 
ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> 
%vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf31_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf32_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf000_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> zeroinitializer -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf001_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf010_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf011_2 = 
shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf100_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf101_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf110_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf111_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf002_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf020_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf022_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf200_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf202_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf220_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf222_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf112_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf121_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 
for instruction: %shuf122_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf211_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf212_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf221_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf000_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> zeroinitializer +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf001_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf010_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf011_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf100_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf101_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf110_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf111_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf002_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; 
ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf020_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf022_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf200_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf202_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf220_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf222_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf112_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf121_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf122_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf211_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf212_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf221_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer @@ -861,22 +861,22 @@ define amdgpu_kernel void @shufflevector_i32(<2 x i32> %vec1, <2 x i32> 
%vec2) { ; Other shuffle cases define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> %i8v4_2, <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i8> %i8v16, <16 x i8> %i8v16_2, <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i16> %i16v8, <8 x i16> %i16v8_2, <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> %i32v4, <4 x i32> %i32v4_2, <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x float> %floatv4, <4 x float> %floatv4_2,<2 x i64> %i64v2, <2 x i64> %i64v2_2,<2 x double> %doublev2, <2 x double> %doublev2_2) { ; GFX9-10-LABEL: 'shuffle' -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = 
shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> +; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> @@ -898,22 +898,22 @@ define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; VI-LABEL: 'shuffle' -; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, 
<2 x i8> %i8v2, <2 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> -; VI-NEXT: Cost Model: 
Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8_2 = 
shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> @@ -935,22 +935,22 @@ define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> ; VI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX9-10-SIZE-LABEL: 'shuffle' -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 
x i8> %i8v2, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> +; GFX9-10-SIZE-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> +; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: 
%v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> @@ -972,22 +972,22 @@ define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; VI-SIZE-LABEL: 'shuffle' -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x 
i8> %i8v6, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8 = shufflevector 
<4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> @@ -1047,9 +1047,9 @@ define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> define void @concat(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i8> %i8v8, <8 x i8> %i8v8_2, <2 x half> %halfv2, <2 x half> %halfv2_2, <4 x half> %halfv4, <4 x half> %halfv4_2, <8 x half> %halfv8, <8 x half> %halfv8_2, <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i16> %i16v8, <8 x i16> %i16v8_2, <2 x i32> %i32v2, <2 x i32> %i32v2_2, 
<4 x i32> %i32v4, <4 x i32> %i32v4_2, <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x float> %floatv4, <4 x float> %floatv4_2,<2 x i64> %i64v2, <2 x i64> %i64v2_2,<2 x double> %doublev2, <2 x double> %doublev2_2) { ; ALL-LABEL: 'concat' -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <4 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <8 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i16 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <16 x i32> @@ -1062,9 +1062,9 @@ define void @concat(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> % ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <4 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <8 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = shufflevector <2 x double> %doublev2, <2 x 
double> %doublev2, <4 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> -; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i32> ; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i16_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <16 x i32> @@ -1080,9 +1080,9 @@ define void @concat(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> % ; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; ALL-SIZE-LABEL: 'concat' -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <4 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <8 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i16 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <16 x i32> @@ -1095,9 +1095,9 @@ define void @concat(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> % ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <4 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <8 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <4 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i32> ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i16_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <16 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll index 3749bdf1bba3943..373c18d0c643176 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll @@ -363,11 +363,66 @@ bb: ret <4 x i16> %ins.3 } +define <4 x i8> @uadd_sat_v4i8(<4 x i8> %arg0, <4 x i8> %arg1) { +; GCN-LABEL: @uadd_sat_v4i8( +; GCN-NEXT: bb: +; GCN-NEXT: [[TMP0:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[ARG0:%.*]], <4 x i8> [[ARG1:%.*]]) +; GCN-NEXT: ret <4 x i8> [[TMP0]] +; +bb: + %arg0.0 = extractelement <4 x i8> %arg0, i64 0 + %arg0.1 = extractelement <4 x i8> %arg0, i64 1 + %arg0.2 = extractelement <4 x i8> %arg0, i64 2 + %arg0.3 = extractelement <4 x i8> %arg0, i64 3 + %arg1.0 = extractelement <4 x i8> %arg1, i64 0 + %arg1.1 = extractelement <4 x i8> %arg1, i64 1 + %arg1.2 = extractelement <4 x i8> %arg1, i64 2 + %arg1.3 = extractelement <4 x i8> %arg1, i64 3 + %add.0 = call i8 @llvm.uadd.sat.i8(i8 %arg0.0, i8 %arg1.0) + %add.1 = call i8 @llvm.uadd.sat.i8(i8 %arg0.1, i8 %arg1.1) + %add.2 = call i8 @llvm.uadd.sat.i8(i8 %arg0.2, i8 %arg1.2) + %add.3 = call i8 @llvm.uadd.sat.i8(i8 %arg0.3, i8 %arg1.3) + %ins.0 = insertelement 
<4 x i8> poison, i8 %add.0, i64 0 + %ins.1 = insertelement <4 x i8> %ins.0, i8 %add.1, i64 1 + %ins.2 = insertelement <4 x i8> %ins.1, i8 %add.2, i64 2 + %ins.3 = insertelement <4 x i8> %ins.2, i8 %add.3, i64 3 + ret <4 x i8> %ins.3 +} + +define <4 x i8> @usub_sat_v4i8(<4 x i8> %arg0, <4 x i8> %arg1) { +; GCN-LABEL: @usub_sat_v4i8( +; GCN-NEXT: bb: +; GCN-NEXT: [[TMP0:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[ARG0:%.*]], <4 x i8> [[ARG1:%.*]]) +; GCN-NEXT: ret <4 x i8> [[TMP0]] +; +bb: + %arg0.0 = extractelement <4 x i8> %arg0, i64 0 + %arg0.1 = extractelement <4 x i8> %arg0, i64 1 + %arg0.2 = extractelement <4 x i8> %arg0, i64 2 + %arg0.3 = extractelement <4 x i8> %arg0, i64 3 + %arg1.0 = extractelement <4 x i8> %arg1, i64 0 + %arg1.1 = extractelement <4 x i8> %arg1, i64 1 + %arg1.2 = extractelement <4 x i8> %arg1, i64 2 + %arg1.3 = extractelement <4 x i8> %arg1, i64 3 + %add.0 = call i8 @llvm.usub.sat.i8(i8 %arg0.0, i8 %arg1.0) + %add.1 = call i8 @llvm.usub.sat.i8(i8 %arg0.1, i8 %arg1.1) + %add.2 = call i8 @llvm.usub.sat.i8(i8 %arg0.2, i8 %arg1.2) + %add.3 = call i8 @llvm.usub.sat.i8(i8 %arg0.3, i8 %arg1.3) + %ins.0 = insertelement <4 x i8> poison, i8 %add.0, i64 0 + %ins.1 = insertelement <4 x i8> %ins.0, i8 %add.1, i64 1 + %ins.2 = insertelement <4 x i8> %ins.1, i8 %add.2, i64 2 + %ins.3 = insertelement <4 x i8> %ins.2, i8 %add.3, i64 3 + ret <4 x i8> %ins.3 +} + declare i16 @llvm.uadd.sat.i16(i16, i16) #0 declare i16 @llvm.usub.sat.i16(i16, i16) #0 declare i16 @llvm.sadd.sat.i16(i16, i16) #0 declare i16 @llvm.ssub.sat.i16(i16, i16) #0 +declare i8 @llvm.uadd.sat.i8(i8, i8) #0 +declare i8 @llvm.usub.sat.i8(i8, i8) #0 + declare i32 @llvm.uadd.sat.i32(i32, i32) #0 declare i32 @llvm.usub.sat.i32(i32, i32) #0 declare i32 @llvm.sadd.sat.i32(i32, i32) #0 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll index 0bb641371825b5d..1d18162a7960e31 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll @@ -363,11 +363,67 @@ bb: ret <4 x i16> %ins.3 } +define <4 x i8> @uadd_sat_v4i8(<4 x i8> %arg0, <4 x i8> %arg1, ptr addrspace(1) %dst) { +; GCN-LABEL: @uadd_sat_v4i8( +; GCN-NEXT: bb: +; GCN-NEXT: [[TMP0:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[ARG0:%.*]], <4 x i8> [[ARG1:%.*]]) +; GCN-NEXT: ret <4 x i8> [[TMP0]] +; +bb: + %arg0.0 = extractelement <4 x i8> %arg0, i64 0 + %arg0.1 = extractelement <4 x i8> %arg0, i64 1 + %arg0.2 = extractelement <4 x i8> %arg0, i64 2 + %arg0.3 = extractelement <4 x i8> %arg0, i64 3 + %arg1.0 = extractelement <4 x i8> %arg1, i64 0 + %arg1.1 = extractelement <4 x i8> %arg1, i64 1 + %arg1.2 = extractelement <4 x i8> %arg1, i64 2 + %arg1.3 = extractelement <4 x i8> %arg1, i64 3 + %add.0 = call i8 @llvm.uadd.sat.i8(i8 %arg0.0, i8 %arg1.0) + %add.1 = call i8 @llvm.uadd.sat.i8(i8 %arg0.1, i8 %arg1.1) + %add.2 = call i8 @llvm.uadd.sat.i8(i8 %arg0.2, i8 %arg1.2) + %add.3 = call i8 @llvm.uadd.sat.i8(i8 %arg0.3, i8 %arg1.3) + %ins.0 = insertelement <4 x i8> undef, i8 %add.0, i64 0 + %ins.1 = insertelement <4 x i8> %ins.0, i8 %add.1, i64 1 + %ins.2 = insertelement <4 x i8> %ins.1, i8 %add.2, i64 2 + %ins.3 = insertelement <4 x i8> %ins.2, i8 %add.3, i64 3 + ret <4 x i8> %ins.3 +} +define <4 x i8> @usub_sat_v4i8(<4 x i8> %arg0, <4 x i8> %arg1) { +; GCN-LABEL: @usub_sat_v4i8( +; GCN-NEXT: bb: +; GCN-NEXT: [[TMP0:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[ARG0:%.*]], <4 x i8> [[ARG1:%.*]]) +; GCN-NEXT: ret <4 x i8> [[TMP0]] +; +bb: + %arg0.0 = extractelement <4 x i8> %arg0, i64 0 + %arg0.1 = extractelement <4 x i8> %arg0, i64 1 + %arg0.2 = extractelement <4 x i8> %arg0, i64 2 + %arg0.3 = extractelement <4 x i8> %arg0, i64 3 + %arg1.0 = extractelement <4 x i8> %arg1, i64 0 + %arg1.1 = extractelement <4 x i8> %arg1, i64 1 + %arg1.2 = extractelement <4 x i8> %arg1, i64 2 + %arg1.3 = extractelement <4 x i8> 
%arg1, i64 3 + %add.0 = call i8 @llvm.usub.sat.i8(i8 %arg0.0, i8 %arg1.0) + %add.1 = call i8 @llvm.usub.sat.i8(i8 %arg0.1, i8 %arg1.1) + %add.2 = call i8 @llvm.usub.sat.i8(i8 %arg0.2, i8 %arg1.2) + %add.3 = call i8 @llvm.usub.sat.i8(i8 %arg0.3, i8 %arg1.3) + %ins.0 = insertelement <4 x i8> undef, i8 %add.0, i64 0 + %ins.1 = insertelement <4 x i8> %ins.0, i8 %add.1, i64 1 + %ins.2 = insertelement <4 x i8> %ins.1, i8 %add.2, i64 2 + %ins.3 = insertelement <4 x i8> %ins.2, i8 %add.3, i64 3 + ret <4 x i8> %ins.3 + +} + + declare i16 @llvm.uadd.sat.i16(i16, i16) #0 declare i16 @llvm.usub.sat.i16(i16, i16) #0 declare i16 @llvm.sadd.sat.i16(i16, i16) #0 declare i16 @llvm.ssub.sat.i16(i16, i16) #0 +declare i8 @llvm.uadd.sat.i8(i8, i8) #0 +declare i8 @llvm.usub.sat.i8(i8, i8) #0 + declare i32 @llvm.uadd.sat.i32(i32, i32) #0 declare i32 @llvm.usub.sat.i32(i32, i32) #0 declare i32 @llvm.sadd.sat.i32(i32, i32) #0 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/i8.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/i8.ll new file mode 100644 index 000000000000000..f116da325c48d5b --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/i8.ll @@ -0,0 +1,428 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer %s | FileCheck -check-prefixes=GFX7 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer %s | FileCheck -check-prefixes=GFX8PLUS,GFX8 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer %s | FileCheck -check-prefixes=GFX8PLUS,GFX9 %s + +define protected amdgpu_kernel void @phi(ptr addrspace(3) %inptr0, ptr addrspace(3) %inptr1, ptr %out, ptr %out1, i32 %flag) { +; GCN-LABEL: @vectorizePHI( +; GCN-NEXT: entry: +; GCN-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0 +; GCN-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 +; GCN-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr 
addrspace(3) [[INPTR0]], i32 1 +; GCN-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GCN-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2 +; GCN-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 +; GCN-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3 +; GCN-NEXT: [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GCN-NEXT: br label [[DO_BODY:%.*]] +; GCN: do.body: +; GCN-NEXT: [[PHI0:%.*]] = phi i8 [ [[ELE3]], [[ENTRY:%.*]] ], [ [[OTHERELE3:%.*]], [[DO_BODY]] ] +; GCN-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], [[ENTRY]] ], [ [[OTHERELE2:%.*]], [[DO_BODY]] ] +; GCN-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], [[ENTRY]] ], [ [[OTHERELE1:%.*]], [[DO_BODY]] ] +; GCN-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], [[ENTRY]] ], [ [[OTHERELE0:%.*]], [[DO_BODY]] ] +; GCN-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8 +; GCN-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GCN-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2 +; GCN-NEXT: [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GCN-NEXT: [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8 +; GCN-NEXT: [[VEC01:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9 +; GCN-NEXT: [[VEC02:%.*]] = insertelement <16 x i8> [[VEC01]], i8 [[OTHERELE2]], i64 10 +; GCN-NEXT: [[VEC03:%.*]] = insertelement <16 x i8> [[VEC02]], i8 [[OTHERELE3]], i64 11 +; GCN-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8 +; GCN-NEXT: [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9 +; GCN-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10 +; GCN-NEXT: [[VEC13:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11 +; GCN-NEXT: store <16 x i8> [[VEC13]], ptr addrspace(3) [[INPTR1:%.*]], align 2 +; GCN-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0 +; GCN-NEXT: br i1 [[CMP]], label 
[[EXIT:%.*]], label [[DO_BODY]] +; GCN: exit: +; GCN-NEXT: store <16 x i8> [[VEC13]], ptr [[OUT:%.*]], align 16 +; GCN-NEXT: store <16 x i8> [[VEC03]], ptr [[OUT1:%.*]], align 16 +; GCN-NEXT: ret void +; +; GFX7-LABEL: @phi( +; GFX7-NEXT: entry: +; GFX7-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0 +; GFX7-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 +; GFX7-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1 +; GFX7-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GFX7-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2 +; GFX7-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 +; GFX7-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3 +; GFX7-NEXT: [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GFX7-NEXT: br label [[DO_BODY:%.*]] +; GFX7: do.body: +; GFX7-NEXT: [[PHI0:%.*]] = phi i8 [ [[ELE3]], [[ENTRY:%.*]] ], [ [[OTHERELE3:%.*]], [[DO_BODY]] ] +; GFX7-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], [[ENTRY]] ], [ [[OTHERELE2:%.*]], [[DO_BODY]] ] +; GFX7-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], [[ENTRY]] ], [ [[OTHERELE1:%.*]], [[DO_BODY]] ] +; GFX7-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], [[ENTRY]] ], [ [[OTHERELE0:%.*]], [[DO_BODY]] ] +; GFX7-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8 +; GFX7-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GFX7-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2 +; GFX7-NEXT: [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GFX7-NEXT: [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8 +; GFX7-NEXT: [[VEC01:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9 +; GFX7-NEXT: [[VEC02:%.*]] = insertelement <16 x i8> [[VEC01]], i8 [[OTHERELE2]], i64 10 +; GFX7-NEXT: [[VEC03:%.*]] = insertelement <16 x i8> [[VEC02]], i8 [[OTHERELE3]], i64 11 +; GFX7-NEXT: [[VEC10:%.*]] = 
insertelement <16 x i8> poison, i8 [[PHI3]], i64 8 +; GFX7-NEXT: [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9 +; GFX7-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10 +; GFX7-NEXT: [[VEC13:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11 +; GFX7-NEXT: store <16 x i8> [[VEC13]], ptr addrspace(3) [[INPTR1:%.*]], align 2 +; GFX7-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0 +; GFX7-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[DO_BODY]] +; GFX7: exit: +; GFX7-NEXT: store <16 x i8> [[VEC13]], ptr [[OUT:%.*]], align 16 +; GFX7-NEXT: store <16 x i8> [[VEC03]], ptr [[OUT1:%.*]], align 16 +; GFX7-NEXT: ret void +; +; GFX8PLUS-LABEL: @phi( +; GFX8PLUS-NEXT: entry: +; GFX8PLUS-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0 +; GFX8PLUS-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8 +; GFX8PLUS-NEXT: br label [[DO_BODY:%.*]] +; GFX8PLUS: do.body: +; GFX8PLUS-NEXT: [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP2:%.*]], [[DO_BODY]] ] +; GFX8PLUS-NEXT: [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8 +; GFX8PLUS-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; GFX8PLUS-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> +; GFX8PLUS-NEXT: store <16 x i8> [[TMP4]], ptr addrspace(3) [[INPTR1:%.*]], align 2 +; GFX8PLUS-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0 +; GFX8PLUS-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[DO_BODY]] +; GFX8PLUS: exit: +; GFX8PLUS-NEXT: store <16 x i8> [[TMP4]], ptr [[OUT:%.*]], align 16 +; GFX8PLUS-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT1:%.*]], align 16 +; GFX8PLUS-NEXT: ret void +; +entry: + %gep0 = getelementptr i8, ptr addrspace(3) %inptr0, i32 0 + %ele0 = load i8, ptr addrspace(3) %gep0, align 8 + %gep1 = getelementptr i8, ptr addrspace(3) %inptr0, i32 1 + %ele1 = load i8, ptr addrspace(3) %gep1, align 1 + %gep2 = 
getelementptr i8, ptr addrspace(3) %inptr0, i32 2 + %ele2 = load i8, ptr addrspace(3) %gep2, align 2 + %gep3 = getelementptr i8, ptr addrspace(3) %inptr0, i32 3 + %ele3 = load i8, ptr addrspace(3) %gep3, align 1 + br label %do.body + +do.body: + %phi0 = phi i8 [ %ele3, %entry ], [ %otherele3, %do.body ] + %phi1 = phi i8 [ %ele2, %entry ], [ %otherele2, %do.body ] + %phi2 = phi i8 [ %ele1, %entry ], [ %otherele1, %do.body ] + %phi3 = phi i8 [ %ele0, %entry ], [ %otherele0, %do.body ] + %otherele0 = load i8, ptr addrspace(3) %gep0, align 8 + %otherele1 = load i8, ptr addrspace(3) %gep1, align 1 + %otherele2 = load i8, ptr addrspace(3) %gep2, align 2 + %otherele3 = load i8, ptr addrspace(3) %gep3, align 1 + %vec00 = insertelement <16 x i8> poison, i8 %otherele0, i64 8 + %vec01 = insertelement <16 x i8> %vec00, i8 %otherele1, i64 9 + %vec02 = insertelement <16 x i8> %vec01, i8 %otherele2, i64 10 + %vec03 = insertelement <16 x i8> %vec02, i8 %otherele3, i64 11 + %vec10 = insertelement <16 x i8> poison, i8 %phi3, i64 8 + %vec11 = insertelement <16 x i8> %vec10, i8 %phi2, i64 9 + %vec12 = insertelement <16 x i8> %vec11, i8 %phi1, i64 10 + %vec13 = insertelement <16 x i8> %vec12, i8 %phi0, i64 11 + store <16 x i8> %vec13, ptr addrspace(3) %inptr1, align 2 + %cmp = icmp eq i32 %flag, 0 + br i1 %cmp, label %exit, label %do.body + +exit: + store <16 x i8> %vec13, ptr %out + store <16 x i8> %vec03, ptr %out1 + ret void +} + + +define protected amdgpu_kernel void @arith_phi(ptr addrspace(3) %inptr0, ptr %out, i32 %flag) { +; GCN-LABEL: @vectorizePHI2( +; GCN-NEXT: entry: +; GCN-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0 +; GCN-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 +; GCN-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1 +; GCN-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GCN-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2 +; GCN-NEXT: 
[[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 +; GCN-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3 +; GCN-NEXT: [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GCN-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0 +; GCN-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[BB_1:%.*]] +; GCN: bb.1: +; GCN-NEXT: [[ADD0:%.*]] = add i8 [[ELE0]], 1 +; GCN-NEXT: [[ADD1:%.*]] = add i8 [[ELE1]], 1 +; GCN-NEXT: [[ADD2:%.*]] = add i8 [[ELE2]], 1 +; GCN-NEXT: [[ADD3:%.*]] = add i8 [[ELE3]], 1 +; GCN-NEXT: br label [[EXIT]] +; GCN: exit: +; GCN-NEXT: [[PHI0:%.*]] = phi i8 [ [[ELE3]], [[ENTRY:%.*]] ], [ [[ADD0]], [[BB_1]] ] +; GCN-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], [[ENTRY]] ], [ [[ADD1]], [[BB_1]] ] +; GCN-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], [[ENTRY]] ], [ [[ADD2]], [[BB_1]] ] +; GCN-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], [[ENTRY]] ], [ [[ADD3]], [[BB_1]] ] +; GCN-NEXT: [[OTHERELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 +; GCN-NEXT: [[OTHERELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GCN-NEXT: [[OTHERELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 +; GCN-NEXT: [[OTHERELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GCN-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8 +; GCN-NEXT: [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9 +; GCN-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10 +; GCN-NEXT: [[VEC13:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11 +; GCN-NEXT: store <16 x i8> [[VEC13]], ptr [[OUT:%.*]], align 2 +; GCN-NEXT: ret void +; +; GFX7-LABEL: @arith_phi( +; GFX7-NEXT: entry: +; GFX7-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0 +; GFX7-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 +; GFX7-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1 +; GFX7-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], 
align 1 +; GFX7-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2 +; GFX7-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 +; GFX7-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3 +; GFX7-NEXT: [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GFX7-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0 +; GFX7-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[BB_1:%.*]] +; GFX7: bb.1: +; GFX7-NEXT: [[ADD0:%.*]] = add i8 [[ELE0]], 1 +; GFX7-NEXT: [[ADD1:%.*]] = add i8 [[ELE1]], 1 +; GFX7-NEXT: [[ADD2:%.*]] = add i8 [[ELE2]], 1 +; GFX7-NEXT: [[ADD3:%.*]] = add i8 [[ELE3]], 1 +; GFX7-NEXT: br label [[EXIT]] +; GFX7: exit: +; GFX7-NEXT: [[PHI0:%.*]] = phi i8 [ [[ELE3]], [[ENTRY:%.*]] ], [ [[ADD0]], [[BB_1]] ] +; GFX7-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], [[ENTRY]] ], [ [[ADD1]], [[BB_1]] ] +; GFX7-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], [[ENTRY]] ], [ [[ADD2]], [[BB_1]] ] +; GFX7-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], [[ENTRY]] ], [ [[ADD3]], [[BB_1]] ] +; GFX7-NEXT: [[OTHERELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 +; GFX7-NEXT: [[OTHERELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GFX7-NEXT: [[OTHERELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 +; GFX7-NEXT: [[OTHERELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GFX7-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8 +; GFX7-NEXT: [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9 +; GFX7-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10 +; GFX7-NEXT: [[VEC13:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11 +; GFX7-NEXT: store <16 x i8> [[VEC13]], ptr [[OUT:%.*]], align 2 +; GFX7-NEXT: ret void +; +; GFX8PLUS-LABEL: @arith_phi( +; GFX8PLUS-NEXT: entry: +; GFX8PLUS-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0 +; GFX8PLUS-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) 
[[INPTR0]], i32 1 +; GFX8PLUS-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2 +; GFX8PLUS-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3 +; GFX8PLUS-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8 +; GFX8PLUS-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0 +; GFX8PLUS-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[BB_1:%.*]] +; GFX8PLUS: bb.1: +; GFX8PLUS-NEXT: [[TMP1:%.*]] = add <4 x i8> [[TMP0]], +; GFX8PLUS-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> +; GFX8PLUS-NEXT: br label [[EXIT]] +; GFX8PLUS: exit: +; GFX8PLUS-NEXT: [[TMP3:%.*]] = phi <4 x i8> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP2]], [[BB_1]] ] +; GFX8PLUS-NEXT: [[OTHERELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8 +; GFX8PLUS-NEXT: [[OTHERELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1 +; GFX8PLUS-NEXT: [[OTHERELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2 +; GFX8PLUS-NEXT: [[OTHERELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1 +; GFX8PLUS-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> +; GFX8PLUS-NEXT: store <16 x i8> [[TMP4]], ptr [[OUT:%.*]], align 2 +; GFX8PLUS-NEXT: ret void +; +entry: + %gep0 = getelementptr i8, ptr addrspace(3) %inptr0, i32 0 + %ele0 = load i8, ptr addrspace(3) %gep0, align 8 + %gep1 = getelementptr i8, ptr addrspace(3) %inptr0, i32 1 + %ele1 = load i8, ptr addrspace(3) %gep1, align 1 + %gep2 = getelementptr i8, ptr addrspace(3) %inptr0, i32 2 + %ele2 = load i8, ptr addrspace(3) %gep2, align 2 + %gep3 = getelementptr i8, ptr addrspace(3) %inptr0, i32 3 + %ele3 = load i8, ptr addrspace(3) %gep3, align 1 + %cmp = icmp eq i32 %flag, 0 + br i1 %cmp, label %exit, label %bb.1 + +bb.1: + %add0 = add i8 %ele0, 1 + %add1 = add i8 %ele1, 1 + %add2 = add i8 %ele2, 1 + %add3 = add i8 %ele3, 1 + br label %exit + +exit: + %phi0 = phi i8 [ %ele3, %entry ], [ %add0, %bb.1 ] + %phi1 = phi i8 [ %ele2, %entry ], [ 
%add1, %bb.1 ] + %phi2 = phi i8 [ %ele1, %entry ], [ %add2, %bb.1 ] + %phi3 = phi i8 [ %ele0, %entry ], [ %add3, %bb.1 ] + %otherele0 = load i8, ptr addrspace(3) %gep0, align 8 + %otherele1 = load i8, ptr addrspace(3) %gep1, align 1 + %otherele2 = load i8, ptr addrspace(3) %gep2, align 2 + %otherele3 = load i8, ptr addrspace(3) %gep3, align 1 + %vec10 = insertelement <16 x i8> poison, i8 %phi3, i64 8 + %vec11 = insertelement <16 x i8> %vec10, i8 %phi2, i64 9 + %vec12 = insertelement <16 x i8> %vec11, i8 %phi1, i64 10 + %vec13 = insertelement <16 x i8> %vec12, i8 %phi0, i64 11 + store <16 x i8> %vec13, ptr %out, align 2 + ret void +} + +define protected amdgpu_kernel void @arith(<16 x i8> %invec, ptr %out, i32 %flag) { +; GFX7-LABEL: @arith( +; GFX7-NEXT: entry: +; GFX7-NEXT: [[EL0:%.*]] = extractelement <16 x i8> [[INVEC:%.*]], i64 0 +; GFX7-NEXT: [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], i64 1 +; GFX7-NEXT: [[EL2:%.*]] = extractelement <16 x i8> [[INVEC]], i64 2 +; GFX7-NEXT: [[EL3:%.*]] = extractelement <16 x i8> [[INVEC]], i64 3 +; GFX7-NEXT: [[EL4:%.*]] = extractelement <16 x i8> [[INVEC]], i64 4 +; GFX7-NEXT: [[EL5:%.*]] = extractelement <16 x i8> [[INVEC]], i64 5 +; GFX7-NEXT: [[EL6:%.*]] = extractelement <16 x i8> [[INVEC]], i64 6 +; GFX7-NEXT: [[EL7:%.*]] = extractelement <16 x i8> [[INVEC]], i64 7 +; GFX7-NEXT: [[EL8:%.*]] = extractelement <16 x i8> [[INVEC]], i64 8 +; GFX7-NEXT: [[EL9:%.*]] = extractelement <16 x i8> [[INVEC]], i64 9 +; GFX7-NEXT: [[EL10:%.*]] = extractelement <16 x i8> [[INVEC]], i64 10 +; GFX7-NEXT: [[EL11:%.*]] = extractelement <16 x i8> [[INVEC]], i64 11 +; GFX7-NEXT: [[EL12:%.*]] = extractelement <16 x i8> [[INVEC]], i64 12 +; GFX7-NEXT: [[EL13:%.*]] = extractelement <16 x i8> [[INVEC]], i64 13 +; GFX7-NEXT: [[EL14:%.*]] = extractelement <16 x i8> [[INVEC]], i64 14 +; GFX7-NEXT: [[EL15:%.*]] = extractelement <16 x i8> [[INVEC]], i64 15 +; GFX7-NEXT: [[MUL0:%.*]] = mul i8 [[EL0]], 1 +; GFX7-NEXT: [[MUL1:%.*]] = mul i8 [[EL1]], 
1 +; GFX7-NEXT: [[MUL2:%.*]] = mul i8 [[EL2]], 1 +; GFX7-NEXT: [[MUL3:%.*]] = mul i8 [[EL3]], 1 +; GFX7-NEXT: [[MUL4:%.*]] = mul i8 [[EL4]], 1 +; GFX7-NEXT: [[MUL5:%.*]] = mul i8 [[EL5]], 1 +; GFX7-NEXT: [[MUL6:%.*]] = mul i8 [[EL6]], 1 +; GFX7-NEXT: [[MUL7:%.*]] = mul i8 [[EL7]], 1 +; GFX7-NEXT: [[MUL8:%.*]] = mul i8 [[EL8]], 1 +; GFX7-NEXT: [[MUL9:%.*]] = mul i8 [[EL9]], 1 +; GFX7-NEXT: [[MUL10:%.*]] = mul i8 [[EL10]], 1 +; GFX7-NEXT: [[MUL11:%.*]] = mul i8 [[EL11]], 1 +; GFX7-NEXT: [[MUL12:%.*]] = mul i8 [[EL12]], 1 +; GFX7-NEXT: [[MUL13:%.*]] = mul i8 [[EL13]], 1 +; GFX7-NEXT: [[MUL14:%.*]] = mul i8 [[EL14]], 1 +; GFX7-NEXT: [[MUL15:%.*]] = mul i8 [[EL15]], 1 +; GFX7-NEXT: [[ADD0:%.*]] = add i8 [[MUL0]], 1 +; GFX7-NEXT: [[ADD1:%.*]] = add i8 [[MUL1]], 1 +; GFX7-NEXT: [[ADD2:%.*]] = add i8 [[MUL2]], 1 +; GFX7-NEXT: [[ADD3:%.*]] = add i8 [[MUL3]], 1 +; GFX7-NEXT: [[ADD4:%.*]] = add i8 [[MUL4]], 1 +; GFX7-NEXT: [[ADD5:%.*]] = add i8 [[MUL5]], 1 +; GFX7-NEXT: [[ADD6:%.*]] = add i8 [[MUL6]], 1 +; GFX7-NEXT: [[ADD7:%.*]] = add i8 [[MUL7]], 1 +; GFX7-NEXT: [[ADD8:%.*]] = add i8 [[MUL8]], 1 +; GFX7-NEXT: [[ADD9:%.*]] = add i8 [[MUL9]], 1 +; GFX7-NEXT: [[ADD10:%.*]] = add i8 [[MUL10]], 1 +; GFX7-NEXT: [[ADD11:%.*]] = add i8 [[MUL11]], 1 +; GFX7-NEXT: [[ADD12:%.*]] = add i8 [[MUL12]], 1 +; GFX7-NEXT: [[ADD13:%.*]] = add i8 [[MUL13]], 1 +; GFX7-NEXT: [[ADD14:%.*]] = add i8 [[MUL14]], 1 +; GFX7-NEXT: [[ADD15:%.*]] = add i8 [[MUL15]], 1 +; GFX7-NEXT: [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD0]], i64 0 +; GFX7-NEXT: [[VECINS1:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1 +; GFX7-NEXT: [[VECINS2:%.*]] = insertelement <16 x i8> [[VECINS1]], i8 [[ADD2]], i64 2 +; GFX7-NEXT: [[VECINS3:%.*]] = insertelement <16 x i8> [[VECINS2]], i8 [[ADD3]], i64 3 +; GFX7-NEXT: [[VECINS4:%.*]] = insertelement <16 x i8> [[VECINS3]], i8 [[ADD4]], i64 4 +; GFX7-NEXT: [[VECINS5:%.*]] = insertelement <16 x i8> [[VECINS4]], i8 [[ADD5]], i64 5 +; GFX7-NEXT: 
[[VECINS6:%.*]] = insertelement <16 x i8> [[VECINS5]], i8 [[ADD6]], i64 6 +; GFX7-NEXT: [[VECINS7:%.*]] = insertelement <16 x i8> [[VECINS6]], i8 [[ADD7]], i64 7 +; GFX7-NEXT: [[VECINS8:%.*]] = insertelement <16 x i8> [[VECINS7]], i8 [[ADD8]], i64 8 +; GFX7-NEXT: [[VECINS9:%.*]] = insertelement <16 x i8> [[VECINS8]], i8 [[ADD9]], i64 9 +; GFX7-NEXT: [[VECINS10:%.*]] = insertelement <16 x i8> [[VECINS9]], i8 [[ADD10]], i64 10 +; GFX7-NEXT: [[VECINS11:%.*]] = insertelement <16 x i8> [[VECINS10]], i8 [[ADD11]], i64 11 +; GFX7-NEXT: [[VECINS12:%.*]] = insertelement <16 x i8> [[VECINS11]], i8 [[ADD12]], i64 12 +; GFX7-NEXT: [[VECINS13:%.*]] = insertelement <16 x i8> [[VECINS12]], i8 [[ADD13]], i64 13 +; GFX7-NEXT: [[VECINS14:%.*]] = insertelement <16 x i8> [[VECINS13]], i8 [[ADD14]], i64 14 +; GFX7-NEXT: [[VECINS15:%.*]] = insertelement <16 x i8> [[VECINS14]], i8 [[ADD15]], i64 15 +; GFX7-NEXT: store <16 x i8> [[VECINS15]], ptr [[OUT:%.*]], align 16 +; GFX7-NEXT: ret void +; +; GFX8PLUS-LABEL: @arith( +; GFX8PLUS-NEXT: entry: +; GFX8PLUS-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC:%.*]], <16 x i8> poison, <4 x i32> +; GFX8PLUS-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[TMP0]], +; GFX8PLUS-NEXT: [[TMP2:%.*]] = add <4 x i8> [[TMP1]], +; GFX8PLUS-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> +; GFX8PLUS-NEXT: [[TMP4:%.*]] = mul <4 x i8> [[TMP3]], +; GFX8PLUS-NEXT: [[TMP5:%.*]] = add <4 x i8> [[TMP4]], +; GFX8PLUS-NEXT: [[TMP6:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> +; GFX8PLUS-NEXT: [[TMP7:%.*]] = mul <4 x i8> [[TMP6]], +; GFX8PLUS-NEXT: [[TMP8:%.*]] = add <4 x i8> [[TMP7]], +; GFX8PLUS-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> +; GFX8PLUS-NEXT: [[TMP10:%.*]] = mul <4 x i8> [[TMP9]], +; GFX8PLUS-NEXT: [[TMP11:%.*]] = add <4 x i8> [[TMP10]], +; GFX8PLUS-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; GFX8PLUS-NEXT: 
[[TMP13:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> +; GFX8PLUS-NEXT: [[VECINS71:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> +; GFX8PLUS-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> +; GFX8PLUS-NEXT: [[VECINS112:%.*]] = shufflevector <16 x i8> [[VECINS71]], <16 x i8> [[TMP14]], <16 x i32> +; GFX8PLUS-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> +; GFX8PLUS-NEXT: [[VECINS153:%.*]] = shufflevector <16 x i8> [[VECINS112]], <16 x i8> [[TMP15]], <16 x i32> +; GFX8PLUS-NEXT: store <16 x i8> [[VECINS153]], ptr [[OUT:%.*]], align 16 +; GFX8PLUS-NEXT: ret void +; +entry: + %el0 = extractelement <16 x i8> %invec, i64 0 + %el1 = extractelement <16 x i8> %invec, i64 1 + %el2 = extractelement <16 x i8> %invec, i64 2 + %el3 = extractelement <16 x i8> %invec, i64 3 + %el4 = extractelement <16 x i8> %invec, i64 4 + %el5 = extractelement <16 x i8> %invec, i64 5 + %el6 = extractelement <16 x i8> %invec, i64 6 + %el7 = extractelement <16 x i8> %invec, i64 7 + %el8 = extractelement <16 x i8> %invec, i64 8 + %el9 = extractelement <16 x i8> %invec, i64 9 + %el10 = extractelement <16 x i8> %invec, i64 10 + %el11 = extractelement <16 x i8> %invec, i64 11 + %el12 = extractelement <16 x i8> %invec, i64 12 + %el13 = extractelement <16 x i8> %invec, i64 13 + %el14 = extractelement <16 x i8> %invec, i64 14 + %el15 = extractelement <16 x i8> %invec, i64 15 + %mul0 = mul i8 %el0, 1 + %mul1 = mul i8 %el1, 1 + %mul2 = mul i8 %el2, 1 + %mul3 = mul i8 %el3, 1 + %mul4 = mul i8 %el4, 1 + %mul5 = mul i8 %el5, 1 + %mul6 = mul i8 %el6, 1 + %mul7 = mul i8 %el7, 1 + %mul8 = mul i8 %el8, 1 + %mul9 = mul i8 %el9, 1 + %mul10 = mul i8 %el10, 1 + %mul11 = mul i8 %el11, 1 + %mul12 = mul i8 %el12, 1 + %mul13 = mul i8 %el13, 1 + %mul14 = mul i8 %el14, 1 + %mul15 = mul i8 %el15, 1 + %add0 = add i8 %mul0, 1 + %add1 = add i8 %mul1, 1 + %add2 = add i8 %mul2, 1 + %add3 = add i8 %mul3, 1 + 
%add4 = add i8 %mul4, 1 + %add5 = add i8 %mul5, 1 + %add6 = add i8 %mul6, 1 + %add7 = add i8 %mul7, 1 + %add8 = add i8 %mul8, 1 + %add9 = add i8 %mul9, 1 + %add10 = add i8 %mul10, 1 + %add11 = add i8 %mul11, 1 + %add12 = add i8 %mul12, 1 + %add13 = add i8 %mul13, 1 + %add14 = add i8 %mul14, 1 + %add15 = add i8 %mul15, 1 + %vecins0 = insertelement <16 x i8> poison, i8 %add0, i64 0 + %vecins1 = insertelement <16 x i8> %vecins0, i8 %add1, i64 1 + %vecins2 = insertelement <16 x i8> %vecins1, i8 %add2, i64 2 + %vecins3 = insertelement <16 x i8> %vecins2, i8 %add3, i64 3 + %vecins4 = insertelement <16 x i8> %vecins3, i8 %add4, i64 4 + %vecins5 = insertelement <16 x i8> %vecins4, i8 %add5, i64 5 + %vecins6 = insertelement <16 x i8> %vecins5, i8 %add6, i64 6 + %vecins7 = insertelement <16 x i8> %vecins6, i8 %add7, i64 7 + %vecins8 = insertelement <16 x i8> %vecins7, i8 %add8, i64 8 + %vecins9 = insertelement <16 x i8> %vecins8, i8 %add9, i64 9 + %vecins10 = insertelement <16 x i8> %vecins9, i8 %add10, i64 10 + %vecins11 = insertelement <16 x i8> %vecins10, i8 %add11, i64 11 + %vecins12 = insertelement <16 x i8> %vecins11, i8 %add12, i64 12 + %vecins13 = insertelement <16 x i8> %vecins12, i8 %add13, i64 13 + %vecins14 = insertelement <16 x i8> %vecins13, i8 %add14, i64 14 + %vecins15 = insertelement <16 x i8> %vecins14, i8 %add15, i64 15 + store <16 x i8> %vecins15, ptr %out + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line:
+; GFX8: {{.*}}
+; GFX9: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
index 3b63c1e35610fe2..08c32691cf366c9 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
@@ -90,3 +90,80 @@ bb1:
   %o3 = insertelement <4 x half> %o2, half %c3, i64 3
   ret <4 x half> %o3
 }
+
+
+; Four scalar i8 phis over lanes extracted from %in1/%in2 and re-packed with insertelement are expected to fold into one <4 x i8> phi.
+define <4 x i8> @phisi8(i1 %cmp1, <4 x i8> %in1, <4 x i8> %in2) {
+; CHECK-LABEL: @phisi8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[CMP1:%.*]], label [[BB1:%.*]], label [[BB0:%.*]]
+; CHECK:       bb0:
+; CHECK-NEXT:    br label [[BB1]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <4 x i8> [ [[IN1:%.*]], [[ENTRY:%.*]] ], [ [[IN2:%.*]], [[BB0]] ]
+; CHECK-NEXT:    ret <4 x i8> [[TMP0]]
+;
+entry:
+  %a0 = extractelement <4 x i8> %in1, i64 0
+  %a1 = extractelement <4 x i8> %in1, i64 1
+  %a2 = extractelement <4 x i8> %in1, i64 2
+  %a3 = extractelement <4 x i8> %in1, i64 3
+  br i1 %cmp1, label %bb1, label %bb0
+
+bb0:
+  %b0 = extractelement <4 x i8> %in2, i64 0
+  %b1 = extractelement <4 x i8> %in2, i64 1
+  %b2 = extractelement <4 x i8> %in2, i64 2
+  %b3 = extractelement <4 x i8> %in2, i64 3
+  br label %bb1
+
+bb1:
+  %c0 = phi i8 [ %a0, %entry ], [ %b0, %bb0 ]
+  %c1 = phi i8 [ %a1, %entry ], [ %b1, %bb0 ]
+  %c2 = phi i8 [ %a2, %entry ], [ %b2, %bb0 ]
+  %c3 = phi i8 [ %a3, %entry ], [ %b3, %bb0 ]
+
+  %o0 = insertelement <4 x i8> poison, i8 %c0, i64 0
+  %o1 = insertelement <4 x i8> %o0, i8 %c1, i64 1
+  %o2 = insertelement <4 x i8> %o1, i8 %c2, i64 2
+  %o3 = insertelement <4 x i8> %o2, i8 %c3, i64 3
+  ret <4 x i8> %o3
+}
+
+; Same as @phisi8, but the scalar phis are defined in reverse lane order (%c3 first); the expected output is the same single <4 x i8> phi.
+define <4 x i8> @phisi8_reverse(i1 %cmp1, <4 x i8> %in1, <4 x i8> %in2) {
+; CHECK-LABEL: @phisi8_reverse(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[CMP1:%.*]], label [[BB1:%.*]], label [[BB0:%.*]]
+; CHECK:       bb0:
+; CHECK-NEXT:    br label [[BB1]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <4 x i8> [ [[IN1:%.*]], [[ENTRY:%.*]] ], [ [[IN2:%.*]], [[BB0]] ]
+; CHECK-NEXT:    ret <4 x i8> [[TMP0]]
+;
+entry:
+  %a0 = extractelement <4 x i8> %in1, i64 0
+  %a1 = extractelement <4 x i8> %in1, i64 1
+  %a2 = extractelement <4 x i8> %in1, i64 2
+  %a3 = extractelement <4 x i8> %in1, i64 3
+  br i1 %cmp1, label %bb1, label %bb0
+
+bb0:
+  %b0 = extractelement <4 x i8> %in2, i64 0
+  %b1 = extractelement <4 x i8> %in2, i64 1
+  %b2 = extractelement <4 x i8> %in2, i64 2
+  %b3 = extractelement <4 x i8> %in2, i64 3
+  br label %bb1
+
+bb1:
+  %c3 = phi i8 [ %a3, %entry ], [ %b3, %bb0 ]
+  %c2 = phi i8 [ %a2, %entry ], [ %b2, %bb0 ]
+  %c1 = phi i8 [ %a1, %entry ], [ %b1, %bb0 ]
+  %c0 = phi i8 [ %a0, %entry ], [ %b0, %bb0 ]
+
+  %o0 = insertelement <4 x i8> poison, i8 %c0, i64 0
+  %o1 = insertelement <4 x i8> %o0, i8 %c1, i64 1
+  %o2 = insertelement <4 x i8> %o1, i8 %c2, i64 2
+  %o3 = insertelement <4 x i8> %o2, i8 %c3, i64 3
+  ret <4 x i8> %o3
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
index aceee8840bb40e7..52a1f8bd5811d7f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
@@ -549,3 +549,224 @@ entry:
 
   ret float %add3
 }
+
+; A serial chain of scalar i8 adds over all four extracted lanes is expected to become one llvm.vector.reduce.add.v4i8 call.
+define i8 @reduction_v4i8(<4 x i8> %a) {
+; GCN-LABEL: @reduction_v4i8(
+; GCN-NEXT:  entry:
+; GCN-NEXT:    [[TMP0:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[A:%.*]])
+; GCN-NEXT:    ret i8 [[TMP0]]
+;
+entry:
+  %elt0 = extractelement <4 x i8> %a, i64 0
+  %elt1 = extractelement <4 x i8> %a, i64 1
+  %elt2 = extractelement <4 x i8> %a, i64 2
+  %elt3 = extractelement <4 x i8> %a, i64 3
+
+  %add1 = add i8 %elt1, %elt0
+  %add2 = add i8 %elt2, %add1
+  %add3 = add i8 %elt3, %add2
+
+  ret i8 %add3
+}
+
+; Eight-lane variant of the scalar add chain; expected to become one llvm.vector.reduce.add.v8i8 call.
+define i8 @reduction_v8i8(<8 x i8> %vec8) {
+; GCN-LABEL: @reduction_v8i8(
+; GCN-NEXT:  entry:
+; GCN-NEXT:    [[TMP0:%.*]] = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> [[VEC8:%.*]])
+; GCN-NEXT:    ret i8 [[TMP0]]
+;
+entry:
+  %elt0 = extractelement <8 x i8> %vec8, i64 0
+  %elt1 = extractelement <8 x i8> %vec8, i64 1
+  %elt2 = extractelement <8 x i8> %vec8, i64 2
+  %elt3 = extractelement <8 x i8> %vec8, i64 3
+  %elt4 = extractelement <8 x i8> %vec8, i64 4
+  %elt5 = extractelement <8 x i8> %vec8, i64 5
+  %elt6 = extractelement <8 x i8> %vec8, i64 6
+  %elt7 = extractelement <8 x i8> %vec8, i64 7
+
+  %add1 = add i8 %elt1, %elt0
+  %add2 = add i8 %elt2, %add1
+  %add3 = add i8 %elt3, %add2
+  %add4 = add i8 %elt4, %add3
+  %add5 = add i8 %elt5, %add4
+  %add6 = add i8 %elt6, %add5
+  %add7 = add i8 %elt7, %add6
+
+  ret i8 %add7
+}
+
+; An icmp ult + select chain over four lanes is expected to become one llvm.vector.reduce.umin.v4i8 call.
+define i8 @reduction_umin_v4i8(<4 x i8> %vec4) {
+; GCN-LABEL: @reduction_umin_v4i8(
+; GCN-NEXT:  entry:
+; GCN-NEXT:    [[TMP0:%.*]] = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> [[VEC4:%.*]])
+; GCN-NEXT:    ret i8 [[TMP0]]
+;
+entry:
+  %elt0 = extractelement <4 x i8> %vec4, i64 0
+  %elt1 = extractelement <4 x i8> %vec4, i64 1
+  %elt2 = extractelement <4 x i8> %vec4, i64 2
+  %elt3 = extractelement <4 x i8> %vec4, i64 3
+
+  %cmp1 = icmp ult i8 %elt1, %elt0
+  %min1 = select i1 %cmp1, i8 %elt1, i8 %elt0
+  %cmp2 = icmp ult i8 %elt2, %min1
+  %min2 = select i1 %cmp2, i8 %elt2, i8 %min1
+  %cmp3 = icmp ult i8 %elt3, %min2
+  %min3 = select i1 %cmp3, i8 %elt3, i8 %min2
+
+  ret i8 %min3
+}
+
+; Eight-lane icmp ult + select chain (an unsigned-min pattern); expected to become one llvm.vector.reduce.umin.v8i8 call.
+define i8 @reduction_icmp_v8i8(<8 x i8> %vec8) {
+; GCN-LABEL: @reduction_icmp_v8i8(
+; GCN-NEXT:  entry:
+; GCN-NEXT:    [[TMP0:%.*]] = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> [[VEC8:%.*]])
+; GCN-NEXT:    ret i8 [[TMP0]]
+;
+entry:
+  %elt0 = extractelement <8 x i8> %vec8, i64 0
+  %elt1 = extractelement <8 x i8> %vec8, i64 1
+  %elt2 = extractelement <8 x i8> %vec8, i64 2
+  %elt3 = extractelement <8 x i8> %vec8, i64 3
+  %elt4 = extractelement <8 x i8> %vec8, i64 4
+  %elt5 = extractelement <8 x i8> %vec8, i64 5
+  %elt6 = extractelement <8 x i8> %vec8, i64 6
+  %elt7 = extractelement <8 x i8> %vec8, i64 7
+
+  %cmp0 = icmp ult i8 %elt1, %elt0
+  %min1 = select i1 %cmp0, i8 %elt1, i8 %elt0
+  %cmp1 = icmp ult i8 %elt2, %min1
+  %min2 = select i1 %cmp1, i8 %elt2, i8 %min1
+  %cmp2 = icmp ult i8 %elt3, %min2
+  %min3 = select i1 %cmp2, i8 %elt3, i8 %min2
+
+  %cmp3 = icmp ult i8 %elt4, %min3
+  %min4 = select i1 %cmp3, i8 %elt4, i8 %min3
+  %cmp4 = icmp ult i8 %elt5, %min4
+  %min5 = select i1 %cmp4, i8 %elt5, i8 %min4
+
+  %cmp5 = icmp ult i8 %elt6, %min5
+  %min6 = select i1 %cmp5, i8 %elt6, i8 %min5
+  %cmp6 = icmp ult i8 %elt7, %min6
+  %min7 = select i1 %cmp6, i8 %elt7, i8 %min6
+
+  ret i8 %min7
+}
+
+; A sixteen-lane icmp slt + select chain is expected to become one llvm.vector.reduce.smin.v16i8 call.
+define i8 @reduction_smin_v16i8(<16 x i8> %vec16) {
+; GCN-LABEL: @reduction_smin_v16i8(
+; GCN-NEXT:  entry:
+; GCN-NEXT:    [[TMP0:%.*]] = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> [[VEC16:%.*]])
+; GCN-NEXT:    ret i8 [[TMP0]]
+;
+entry:
+  %elt0 = extractelement <16 x i8> %vec16, i64 0
+  %elt1 = extractelement <16 x i8> %vec16, i64 1
+  %elt2 = extractelement <16 x i8> %vec16, i64 2
+  %elt3 = extractelement <16 x i8> %vec16, i64 3
+  %elt4 = extractelement <16 x i8> %vec16, i64 4
+  %elt5 = extractelement <16 x i8> %vec16, i64 5
+  %elt6 = extractelement <16 x i8> %vec16, i64 6
+  %elt7 = extractelement <16 x i8> %vec16, i64 7
+
+  %elt8 = extractelement <16 x i8> %vec16, i64 8
+  %elt9 = extractelement <16 x i8> %vec16, i64 9
+  %elt10 = extractelement <16 x i8> %vec16, i64 10
+  %elt11 = extractelement <16 x i8> %vec16, i64 11
+  %elt12 = extractelement <16 x i8> %vec16, i64 12
+  %elt13 = extractelement <16 x i8> %vec16, i64 13
+  %elt14 = extractelement <16 x i8> %vec16, i64 14
+  %elt15 = extractelement <16 x i8> %vec16, i64 15
+
+  %cmp0 = icmp slt i8 %elt1, %elt0
+  %min1 = select i1 %cmp0, i8 %elt1, i8 %elt0
+  %cmp1 = icmp slt i8 %elt2, %min1
+  %min2 = select i1 %cmp1, i8 %elt2, i8 %min1
+  %cmp2 = icmp slt i8 %elt3, %min2
+  %min3 = select i1 %cmp2, i8 %elt3, i8 %min2
+
+  %cmp3 = icmp slt i8 %elt4, %min3
+  %min4 = select i1 %cmp3, i8 %elt4, i8 %min3
+  %cmp4 = icmp slt i8 %elt5, %min4
+  %min5 = select i1 %cmp4, i8 %elt5, i8 %min4
+
+  %cmp5 = icmp slt i8 %elt6, %min5
+  %min6 = select i1 %cmp5, i8 %elt6, i8 %min5
+  %cmp6 = icmp slt i8 %elt7, %min6
+  %min7 = select i1 %cmp6, i8 %elt7, i8 %min6
+
+  %cmp7 = icmp slt i8 %elt8, %min7
+  %min8 = select i1 %cmp7, i8 %elt8, i8 %min7
+  %cmp8 = icmp slt i8 %elt9, %min8
+  %min9 = select i1 %cmp8, i8 %elt9, i8 %min8
+
+  %cmp9 = icmp slt i8 %elt10, %min9
+  %min10 = select i1 %cmp9, i8 %elt10, i8 %min9
+  %cmp10 = icmp slt i8 %elt11, %min10
+  %min11 = select i1 %cmp10, i8 %elt11, i8 %min10
+
+  %cmp11 = icmp slt i8 %elt12, %min11
+  %min12 = select i1 %cmp11, i8 %elt12, i8 %min11
+  %cmp12 = icmp slt i8 %elt13, %min12
+  %min13 = select i1 %cmp12, i8 %elt13, i8 %min12
+
+  %cmp13 = icmp slt i8 %elt14, %min13
+  %min14 = select i1 %cmp13, i8 %elt14, i8 %min13
+  %cmp14 = icmp slt i8 %elt15, %min14
+  %min15 = select i1 %cmp14, i8 %elt15, i8 %min14
+
+
+  ret i8 %min15
+}
+
+; An icmp ugt + select chain over four lanes is expected to become one llvm.vector.reduce.umax.v4i8 call.
+define i8 @reduction_umax_v4i8(<4 x i8> %vec4) {
+; GCN-LABEL: @reduction_umax_v4i8(
+; GCN-NEXT:  entry:
+; GCN-NEXT:    [[TMP0:%.*]] = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> [[VEC4:%.*]])
+; GCN-NEXT:    ret i8 [[TMP0]]
+;
+entry:
+  %elt0 = extractelement <4 x i8> %vec4, i64 0
+  %elt1 = extractelement <4 x i8> %vec4, i64 1
+  %elt2 = extractelement <4 x i8> %vec4, i64 2
+  %elt3 = extractelement <4 x i8> %vec4, i64 3
+
+  %cmp1 = icmp ugt i8 %elt1, %elt0
+  %max1 = select i1 %cmp1, i8 %elt1, i8 %elt0
+  %cmp2 = icmp ugt i8 %elt2, %max1
+  %max2 = select i1 %cmp2, i8 %elt2, i8 %max1
+  %cmp3 = icmp ugt i8 %elt3, %max2
+  %max3 = select i1 %cmp3, i8 %elt3, i8 %max2
+
+  ret i8 %max3
+}
+
+; An icmp sgt + select chain over four lanes is expected to become one llvm.vector.reduce.smax.v4i8 call.
+define i8 @reduction_smax_v4i8(<4 x i8> %vec4) {
+; GCN-LABEL: @reduction_smax_v4i8(
+; GCN-NEXT:  entry:
+; GCN-NEXT:    [[TMP0:%.*]] = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> [[VEC4:%.*]])
+; GCN-NEXT:    ret i8 [[TMP0]]
+;
+entry:
+  %elt0 = extractelement <4 x i8> %vec4, i64 0
+  %elt1 = extractelement <4 x i8> %vec4, i64 1
+  %elt2 = extractelement <4 x i8> %vec4, i64 2
+  %elt3 = extractelement <4 x i8> %vec4, i64 3
+
+  %cmp1 = icmp sgt i8 %elt1, %elt0
+  %max1 = select i1 %cmp1, i8 %elt1, i8 %elt0
+  %cmp2 = icmp sgt i8 %elt2, %max1
+  %max2 = select i1 %cmp2, i8 %elt2, i8 %max1
+  %cmp3 = icmp sgt i8 %elt3, %max2
+  %max3 = select i1 %cmp3, i8 %elt3, i8 %max2
+
+  ret i8 %max3
+}