diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 38ac90f0c081b3..59743dbe4d2ea4 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3156,13 +3156,11 @@ class TargetLoweringBase {
   /// Return true on success. Currently only supports
   /// llvm.vector.deinterleave2
   ///
-  /// \p DI is the deinterleave intrinsic.
-  /// \p LI is the accompanying load instruction
-  /// \p DeadInsts is a reference to a vector that keeps track of dead
-  /// instruction during transformations.
-  virtual bool lowerDeinterleaveIntrinsicToLoad(
-      IntrinsicInst *DI, LoadInst *LI,
-      SmallVectorImpl<Instruction *> &DeadInsts) const {
+  /// \p LI is the accompanying load instruction.
+  /// \p DeinterleaveValues contains the deinterleaved values.
+  virtual bool
+  lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
+                                   ArrayRef<Value *> DeinterleaveValues) const {
     return false;
   }
 
@@ -3170,13 +3168,11 @@ class TargetLoweringBase {
   /// Return true on success. Currently only supports
   /// llvm.vector.interleave2
   ///
-  /// \p II is the interleave intrinsic.
   /// \p SI is the accompanying store instruction
-  /// \p DeadInsts is a reference to a vector that keeps track of dead
-  /// instruction during transformations.
-  virtual bool lowerInterleaveIntrinsicToStore(
-      IntrinsicInst *II, StoreInst *SI,
-      SmallVectorImpl<Instruction *> &DeadInsts) const {
+  /// \p InterleaveValues contains the interleaved values.
+  virtual bool
+  lowerInterleaveIntrinsicToStore(StoreInst *SI,
+                                  ArrayRef<Value *> InterleaveValues) const {
     return false;
   }
 
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index c6d5533fd2bae2..3f6a69ecb7d729 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -60,6 +60,7 @@
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
@@ -478,6 +479,157 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
   return true;
 }
 
+// For a (de)interleave tree like this:
+//
+//   A   C B   D
+//   |___| |___|
+//     |_____|
+//        |
+//     A B C D
+//
+// We will get ABCD at the end while the leaf operands/results
+// are ACBD, which are also what we initially collected in
+// getVectorInterleaveFactor / getVectorDeinterleaveFactor. But TLI
+// hooks (e.g. lowerDeinterleaveIntrinsicToLoad) expect ABCD, so we need
+// to reorder them by interleaving these values.
+static void interleaveLeafValues(MutableArrayRef<Value *> SubLeaves) {
+  unsigned NumLeaves = SubLeaves.size();
+  if (NumLeaves == 2)
+    return;
+
+  assert(isPowerOf2_32(NumLeaves) && NumLeaves > 1);
+
+  const unsigned HalfLeaves = NumLeaves / 2;
+  // Visit the sub-trees.
+  interleaveLeafValues(SubLeaves.take_front(HalfLeaves));
+  interleaveLeafValues(SubLeaves.drop_front(HalfLeaves));
+
+  SmallVector<Value *> Buffer;
+  // a0 a1 a2 a3 b0 b1 b2 b3
+  // -> a0 b0 a1 b1 a2 b2 a3 b3
+  for (unsigned i = 0U; i < NumLeaves; ++i)
+    Buffer.push_back(SubLeaves[i / 2 + (i % 2 ? HalfLeaves : 0)]);
+
+  llvm::copy(Buffer, SubLeaves.begin());
+}
+
+static bool
+getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl<Value *> &Operands,
+                          SmallVectorImpl<Instruction *> &DeadInsts) {
+  assert(II->getIntrinsicID() == Intrinsic::vector_interleave2);
+
+  // Visit with BFS
+  SmallVector<IntrinsicInst *> Queue;
+  Queue.push_back(II);
+  while (!Queue.empty()) {
+    IntrinsicInst *Current = Queue.front();
+    Queue.erase(Queue.begin());
+
+    // All the intermediate intrinsics will be deleted.
+    DeadInsts.push_back(Current);
+
+    for (unsigned I = 0; I < 2; ++I) {
+      Value *Op = Current->getOperand(I);
+      if (auto *OpII = dyn_cast<IntrinsicInst>(Op))
+        if (OpII->getIntrinsicID() == Intrinsic::vector_interleave2) {
+          Queue.push_back(OpII);
+          continue;
+        }
+
+      // If this is not a perfectly balanced tree, the leaf
+      // result types would be different.
+      if (!Operands.empty() && Op->getType() != Operands.back()->getType())
+        return false;
+
+      Operands.push_back(Op);
+    }
+  }
+
+  const unsigned Factor = Operands.size();
+  // Currently we only recognize power-of-two factors.
+  // FIXME: should we assert here instead?
+  if (Factor <= 1 || !isPowerOf2_32(Factor))
+    return false;
+
+  interleaveLeafValues(Operands);
+  return true;
+}
+
+static bool
+getVectorDeinterleaveFactor(IntrinsicInst *II,
+                            SmallVectorImpl<Value *> &Results,
+                            SmallVectorImpl<Instruction *> &DeadInsts) {
+  assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2);
+  using namespace PatternMatch;
+  if (!II->hasNUses(2))
+    return false;
+
+  // Visit with BFS
+  SmallVector<IntrinsicInst *> Queue;
+  Queue.push_back(II);
+  while (!Queue.empty()) {
+    IntrinsicInst *Current = Queue.front();
+    Queue.erase(Queue.begin());
+    assert(Current->hasNUses(2));
+
+    // All the intermediate intrinsics will be deleted from the bottom-up.
+    DeadInsts.insert(DeadInsts.begin(), Current);
+
+    ExtractValueInst *LHS = nullptr, *RHS = nullptr;
+    for (User *Usr : Current->users()) {
+      if (!isa<ExtractValueInst>(Usr))
+        return false;
+
+      auto *EV = cast<ExtractValueInst>(Usr);
+      // Intermediate ExtractValue instructions will also be deleted.
+      DeadInsts.insert(DeadInsts.begin(), EV);
+      ArrayRef<unsigned> Indices = EV->getIndices();
+      if (Indices.size() != 1)
+        return false;
+
+      if (Indices[0] == 0 && !LHS)
+        LHS = EV;
+      else if (Indices[0] == 1 && !RHS)
+        RHS = EV;
+      else
+        return false;
+    }
+
+    // We have legal indices. At this point we're either going
+    // to continue the traversal or push the leaf values into Results.
+    for (ExtractValueInst *EV : {LHS, RHS}) {
+      // Continue the traversal. We're playing safe here and matching only the
+      // expression consisting of a perfectly balanced binary tree in which all
+      // intermediate values are only used once.
+      if (EV->hasOneUse() &&
+          match(EV->user_back(),
+                m_Intrinsic<Intrinsic::vector_deinterleave2>()) &&
+          EV->user_back()->hasNUses(2)) {
+        auto *EVUsr = cast<IntrinsicInst>(EV->user_back());
+        Queue.push_back(EVUsr);
+        continue;
+      }
+
+      // If this is not a perfectly balanced tree, the leaf
+      // result types would be different.
+      if (!Results.empty() && EV->getType() != Results.back()->getType())
+        return false;
+
+      // Save the leaf value.
+      Results.push_back(EV);
+    }
+  }
+
+  const unsigned Factor = Results.size();
+  // Currently we only recognize power-of-two factors.
+  // FIXME: should we assert here instead?
+  if (Factor <= 1 || !isPowerOf2_32(Factor))
+    return false;
+
+  interleaveLeafValues(Results);
+  return true;
+}
+
 bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
     IntrinsicInst *DI, SmallSetVector<Instruction *, 32> &DeadInsts) {
   LoadInst *LI = dyn_cast<LoadInst>(DI->getOperand(0));
@@ -485,16 +637,21 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
   if (!LI || !LI->hasOneUse() || !LI->isSimple())
     return false;
 
-  LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
+  SmallVector<Value *> DeinterleaveValues;
+  SmallVector<Instruction *> DeinterleaveDeadInsts;
+  if (!getVectorDeinterleaveFactor(DI, DeinterleaveValues,
+                                   DeinterleaveDeadInsts))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI
+                    << " with factor = " << DeinterleaveValues.size() << "\n");
 
   // Try and match this with target specific intrinsics.
-  SmallVector<Instruction *> DeinterleaveDeadInsts;
-  if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI, DeinterleaveDeadInsts))
+  if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues))
     return false;
 
   DeadInsts.insert(DeinterleaveDeadInsts.begin(), DeinterleaveDeadInsts.end());
   // We now have a target-specific load, so delete the old one.
-  DeadInsts.insert(DI);
   DeadInsts.insert(LI);
   return true;
 }
@@ -509,16 +666,20 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
   if (!SI || !SI->isSimple())
     return false;
 
-  LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
+  SmallVector<Value *> InterleaveValues;
+  SmallVector<Instruction *> InterleaveDeadInsts;
+  if (!getVectorInterleaveFactor(II, InterleaveValues, InterleaveDeadInsts))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II
+                    << " with factor = " << InterleaveValues.size() << "\n");
 
-  SmallVector<Instruction *> InterleaveDeadInsts;
   // Try and match this with target specific intrinsics.
-  if (!TLI->lowerInterleaveIntrinsicToStore(II, SI, InterleaveDeadInsts))
+  if (!TLI->lowerInterleaveIntrinsicToStore(SI, InterleaveValues))
     return false;
 
   // We now have a target-specific store, so delete the old one.
DeadInsts.insert(SI); - DeadInsts.insert(II); DeadInsts.insert(InterleaveDeadInsts.begin(), InterleaveDeadInsts.end()); return true; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 9a0bb73087980d..4ede1fb93fe5f2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17464,148 +17464,17 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, return true; } -bool getDeinterleave2Values( - Value *DI, SmallVectorImpl &DeinterleavedValues, - SmallVectorImpl &DeInterleaveDeadInsts) { - if (!DI->hasNUses(2)) - return false; - auto *Extr1 = dyn_cast(*(DI->user_begin())); - auto *Extr2 = dyn_cast(*(++DI->user_begin())); - if (!Extr1 || !Extr2) - return false; - - DeinterleavedValues.resize(2); - // Place the values into the vector in the order of extraction: - DeinterleavedValues[0x1 & (Extr1->getIndices()[0])] = Extr1; - DeinterleavedValues[0x1 & (Extr2->getIndices()[0])] = Extr2; - if (!DeinterleavedValues[0] || !DeinterleavedValues[1]) - return false; - - // Make sure that the extracted values match the deinterleave tree pattern - if (!match(DeinterleavedValues[0], m_ExtractValue<0>((m_Specific(DI)))) || - !match(DeinterleavedValues[1], m_ExtractValue<1>((m_Specific(DI))))) { - LLVM_DEBUG(dbgs() << "matching deinterleave2 failed\n"); - return false; - } - // DeinterleavedValues will be replace by output of ld2 - DeInterleaveDeadInsts.insert(DeInterleaveDeadInsts.end(), - DeinterleavedValues.begin(), - DeinterleavedValues.end()); - return true; -} - -/* -DeinterleaveIntrinsic tree: - [DI] - / \ - [Extr<0>] [Extr<1>] - | | - [DI] [DI] - / \ / \ - [Extr<0>][Extr<1>] [Extr<0>][Extr<1>] - | | | | -roots: A C B D -roots in correct order of DI4 will be: A B C D. -Returns true if `DI` is the top of an IR tree that represents a theoretical -vector.deinterleave4 intrinsic. When true is returned, \p `DeinterleavedValues` -vector is populated with the results such an intrinsic would return: (i.e. 
{A, -B, C, D } = vector.deinterleave4(...)) -*/ -bool getDeinterleave4Values( - Value *DI, SmallVectorImpl &DeinterleavedValues, - SmallVectorImpl &DeInterleaveDeadInsts) { - if (!DI->hasNUses(2)) - return false; - auto *Extr1 = dyn_cast(*(DI->user_begin())); - auto *Extr2 = dyn_cast(*(++DI->user_begin())); - if (!Extr1 || !Extr2) - return false; - - if (!Extr1->hasOneUse() || !Extr2->hasOneUse()) - return false; - auto *DI1 = *(Extr1->user_begin()); - auto *DI2 = *(Extr2->user_begin()); - - if (!DI1->hasNUses(2) || !DI2->hasNUses(2)) - return false; - // Leaf nodes of the deinterleave tree: - auto *A = dyn_cast(*(DI1->user_begin())); - auto *C = dyn_cast(*(++DI1->user_begin())); - auto *B = dyn_cast(*(DI2->user_begin())); - auto *D = dyn_cast(*(++DI2->user_begin())); - // Make sure that the A,B,C and D are ExtractValue instructions before getting - // the extract index - if (!A || !B || !C || !D) - return false; - - DeinterleavedValues.resize(4); - // Place the values into the vector in the order of deinterleave4: - DeinterleavedValues[0x3 & - ((A->getIndices()[0] * 2) + Extr1->getIndices()[0])] = A; - DeinterleavedValues[0x3 & - ((B->getIndices()[0] * 2) + Extr2->getIndices()[0])] = B; - DeinterleavedValues[0x3 & - ((C->getIndices()[0] * 2) + Extr1->getIndices()[0])] = C; - DeinterleavedValues[0x3 & - ((D->getIndices()[0] * 2) + Extr2->getIndices()[0])] = D; - if (!DeinterleavedValues[0] || !DeinterleavedValues[1] || - !DeinterleavedValues[2] || !DeinterleavedValues[3]) - return false; - - // Make sure that A,B,C,D match the deinterleave tree pattern - if (!match(DeinterleavedValues[0], m_ExtractValue<0>(m_Deinterleave2( - m_ExtractValue<0>(m_Specific(DI))))) || - !match(DeinterleavedValues[1], m_ExtractValue<0>(m_Deinterleave2( - m_ExtractValue<1>(m_Specific(DI))))) || - !match(DeinterleavedValues[2], m_ExtractValue<1>(m_Deinterleave2( - m_ExtractValue<0>(m_Specific(DI))))) || - !match(DeinterleavedValues[3], m_ExtractValue<1>(m_Deinterleave2( - m_ExtractValue<1>(m_Specific(DI)))))) { - LLVM_DEBUG(dbgs() << "matching deinterleave4 failed\n"); - return false; - } - - // These Values will not be used anymore, - // DI4 will be created instead of nested DI1 and DI2 - DeInterleaveDeadInsts.insert(DeInterleaveDeadInsts.end(), - DeinterleavedValues.begin(), - DeinterleavedValues.end()); - DeInterleaveDeadInsts.push_back(cast(DI1)); - DeInterleaveDeadInsts.push_back(cast(Extr1)); - DeInterleaveDeadInsts.push_back(cast(DI2)); - DeInterleaveDeadInsts.push_back(cast(Extr2)); - - return true; -} - -bool getDeinterleavedValues( - Value *DI, SmallVectorImpl &DeinterleavedValues, - SmallVectorImpl &DeInterleaveDeadInsts) { - if (getDeinterleave4Values(DI, DeinterleavedValues, DeInterleaveDeadInsts)) - return true; - return getDeinterleave2Values(DI, DeinterleavedValues, DeInterleaveDeadInsts); -} - bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( - IntrinsicInst *DI, LoadInst *LI, - SmallVectorImpl &DeadInsts) const { - // Only deinterleave2 supported at present. 
- if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2) - return false; - - SmallVector DeinterleavedValues; - SmallVector DeInterleaveDeadInsts; - - if (!getDeinterleavedValues(DI, DeinterleavedValues, DeInterleaveDeadInsts)) { + LoadInst *LI, ArrayRef DeinterleavedValues) const { + unsigned Factor = DeinterleavedValues.size(); + if (Factor != 2 && Factor != 4) { LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n"); return false; } - unsigned Factor = DeinterleavedValues.size(); - assert((Factor == 2 || Factor == 4) && - "Currently supported Factor is 2 or 4 only"); + VectorType *VTy = cast(DeinterleavedValues[0]->getType()); - const DataLayout &DL = DI->getModule()->getDataLayout(); + const DataLayout &DL = LI->getModule()->getDataLayout(); bool UseScalable; if (!isLegalInterleavedAccessType(VTy, DL, UseScalable)) return false; @@ -17621,7 +17490,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( VTy->getElementCount().divideCoefficientBy(NumLoads)); Type *PtrTy = LI->getPointerOperandType(); - Function *LdNFunc = getStructuredLoadFunction(DI->getModule(), Factor, + Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor, UseScalable, LdTy, PtrTy); IRBuilder<> Builder(LI); @@ -17666,72 +17535,19 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( DeinterleavedValues[I]->replaceAllUsesWith(NewExtract); } } - DeadInsts.insert(DeadInsts.end(), DeInterleaveDeadInsts.begin(), - DeInterleaveDeadInsts.end()); return true; } -/* -InterleaveIntrinsic tree. - A C B D - \ / \ / - [II] [II] - \ / - [II] - -values in correct order of interleave4: A B C D. -Returns true if `II` is the root of an IR tree that represents a theoretical -vector.interleave4 intrinsic. When true is returned, \p `InterleavedValues` -vector is populated with the inputs such an intrinsic would take: (i.e. -vector.interleave4(A, B, C, D)). -*/ -bool getValuesToInterleave( - Value *II, SmallVectorImpl &InterleavedValues, - SmallVectorImpl &InterleaveDeadInsts) { - Value *A, *B, *C, *D; - // Try to match interleave of Factor 4 - if (match(II, m_Interleave2(m_Interleave2(m_Value(A), m_Value(C)), - m_Interleave2(m_Value(B), m_Value(D))))) { - InterleavedValues.push_back(A); - InterleavedValues.push_back(B); - InterleavedValues.push_back(C); - InterleavedValues.push_back(D); - // intermediate II will not be needed anymore - InterleaveDeadInsts.push_back( - cast(cast(II)->getOperand(0))); - InterleaveDeadInsts.push_back( - cast(cast(II)->getOperand(1))); - return true; - } - - // Try to match interleave of Factor 2 - if (match(II, m_Interleave2(m_Value(A), m_Value(B)))) { - InterleavedValues.push_back(A); - InterleavedValues.push_back(B); - return true; - } - - return false; -} - bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore( - IntrinsicInst *II, StoreInst *SI, - SmallVectorImpl &DeadInsts) const { - // Only interleave2 supported at present. 
- if (II->getIntrinsicID() != Intrinsic::vector_interleave2) - return false; - - SmallVector InterleavedValues; - SmallVector InterleaveDeadInsts; - if (!getValuesToInterleave(II, InterleavedValues, InterleaveDeadInsts)) { + StoreInst *SI, ArrayRef InterleavedValues) const { + unsigned Factor = InterleavedValues.size(); + if (Factor != 2 && Factor != 4) { LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n"); return false; } - unsigned Factor = InterleavedValues.size(); - assert((Factor == 2 || Factor == 4) && - "Currently supported Factor is 2 or 4 only"); + VectorType *VTy = cast(InterleavedValues[0]->getType()); - const DataLayout &DL = II->getModule()->getDataLayout(); + const DataLayout &DL = SI->getModule()->getDataLayout(); bool UseScalable; if (!isLegalInterleavedAccessType(VTy, DL, UseScalable)) @@ -17762,9 +17578,11 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore( Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue()); auto ExtractedValues = InterleavedValues; + SmallVector StoreOperands(InterleavedValues.begin(), + InterleavedValues.end()); if (UseScalable) - InterleavedValues.push_back(Pred); - InterleavedValues.push_back(BaseAddr); + StoreOperands.push_back(Pred); + StoreOperands.push_back(BaseAddr); for (unsigned I = 0; I < NumStores; ++I) { Value *Address = BaseAddr; if (NumStores > 1) { @@ -17773,16 +17591,14 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore( Value *Idx = Builder.getInt64(I * StTy->getElementCount().getKnownMinValue()); for (unsigned J = 0; J < Factor; J++) { - InterleavedValues[J] = + StoreOperands[J] = Builder.CreateExtractVector(StTy, ExtractedValues[J], Idx); } // update the address - InterleavedValues[InterleavedValues.size() - 1] = Address; + StoreOperands[StoreOperands.size() - 1] = Address; } - Builder.CreateCall(StNFunc, InterleavedValues); + Builder.CreateCall(StNFunc, StoreOperands); } - DeadInsts.insert(DeadInsts.end(), InterleaveDeadInsts.begin(), - InterleaveDeadInsts.end()); return true; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 61579de50db17e..470ed2a06b706a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -714,12 +714,10 @@ class AArch64TargetLowering : public TargetLowering { unsigned Factor) const override; bool lowerDeinterleaveIntrinsicToLoad( - IntrinsicInst *DI, LoadInst *LI, - SmallVectorImpl &DeadInsts) const override; + LoadInst *LI, ArrayRef DeinterleaveValues) const override; bool lowerInterleaveIntrinsicToStore( - IntrinsicInst *II, StoreInst *SI, - SmallVectorImpl &DeadInsts) const override; + StoreInst *SI, ArrayRef InterleaveValues) const override; bool isLegalAddImmediate(int64_t) const override; bool isLegalAddScalableImmediate(int64_t) const override; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index f7efd5f437fbb1..295fd315c56daf 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22399,18 +22399,16 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, } bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( - IntrinsicInst *DI, LoadInst *LI, - SmallVectorImpl &DeadInsts) const { + LoadInst *LI, ArrayRef DeinterleaveValues) const { + unsigned Factor = DeinterleaveValues.size(); + if (Factor > 8) + return false; + assert(LI->isSimple()); IRBuilder<> Builder(LI); - // Only deinterleave2 supported at 
present. - if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2) - return false; - - const unsigned Factor = 2; + auto *ResVTy = cast(DeinterleaveValues[0]->getType()); - VectorType *ResVTy = cast(DI->getType()->getContainedType(0)); const DataLayout &DL = LI->getDataLayout(); if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(), @@ -22458,24 +22456,27 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( } } - DI->replaceAllUsesWith(Return); + for (auto [Idx, DIV] : enumerate(DeinterleaveValues)) { + // We have to create a brand new ExtractValue to replace each + // of these old ExtractValue instructions. + Value *NewEV = + Builder.CreateExtractValue(Return, {static_cast(Idx)}); + DIV->replaceAllUsesWith(NewEV); + } return true; } bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( - IntrinsicInst *II, StoreInst *SI, - SmallVectorImpl &DeadInsts) const { - assert(SI->isSimple()); - IRBuilder<> Builder(SI); - - // Only interleave2 supported at present. - if (II->getIntrinsicID() != Intrinsic::vector_interleave2) + StoreInst *SI, ArrayRef InterleaveValues) const { + unsigned Factor = InterleaveValues.size(); + if (Factor > 8) return false; - const unsigned Factor = 2; + assert(SI->isSimple()); + IRBuilder<> Builder(SI); - VectorType *InVTy = cast(II->getArgOperand(0)->getType()); + auto *InVTy = cast(InterleaveValues[0]->getType()); const DataLayout &DL = SI->getDataLayout(); if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(), @@ -22485,11 +22486,16 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( Type *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen()); if (auto *FVTy = dyn_cast(InVTy)) { + Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( + SI->getModule(), FixedVssegIntrIds[Factor - 2], + {InVTy, SI->getPointerOperandType(), XLenTy}); + + SmallVector Ops(InterleaveValues.begin(), + InterleaveValues.end()); Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements()); - Builder.CreateIntrinsic(FixedVssegIntrIds[Factor - 2], - {InVTy, SI->getPointerOperandType(), XLenTy}, - {II->getArgOperand(0), II->getArgOperand(1), - SI->getPointerOperand(), VL}); + Ops.append({SI->getPointerOperand(), VL}); + + Builder.CreateCall(VssegNFunc, Ops); } else { static const Intrinsic::ID IntrIds[] = { Intrinsic::riscv_vsseg2, Intrinsic::riscv_vsseg3, @@ -22514,7 +22520,7 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( for (unsigned i = 0; i < Factor; ++i) StoredVal = Builder.CreateIntrinsic( Intrinsic::riscv_tuple_insert, {VecTupTy, InVTy}, - {StoredVal, II->getArgOperand(i), Builder.getInt32(i)}); + {StoredVal, InterleaveValues[i], Builder.getInt32(i)}); Builder.CreateCall(VssegNFunc, {StoredVal, SI->getPointerOperand(), VL, ConstantInt::get(XLenTy, Log2_64(SEW))}); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 21747cc353203e..77605a3076a80a 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -905,12 +905,10 @@ class RISCVTargetLowering : public TargetLowering { unsigned Factor) const override; bool lowerDeinterleaveIntrinsicToLoad( - IntrinsicInst *II, LoadInst *LI, - SmallVectorImpl &DeadInsts) const override; + LoadInst *LI, ArrayRef DeinterleaveValues) const override; bool lowerInterleaveIntrinsicToStore( - IntrinsicInst *II, StoreInst *SI, - SmallVectorImpl &DeadInsts) const override; + StoreInst *SI, ArrayRef InterleaveValues) const override; bool supportKCFIBundles() const override { return 
true; } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll index ede25d2c9bb07c..b4634dbf5a5e87 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll @@ -40,8 +40,12 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_load_v16i1_v32i1(ptr %p) { ; CHECK-NEXT: vmsne.vi v8, v8, 0 ; CHECK-NEXT: ret %vec = load <32 x i1>, ptr %p - %retval = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec) - ret {<16 x i1>, <16 x i1>} %retval + %deinterleaved.results = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec) + %t0 = extractvalue { <16 x i1>, <16 x i1> } %deinterleaved.results, 0 + %t1 = extractvalue { <16 x i1>, <16 x i1> } %deinterleaved.results, 1 + %res0 = insertvalue { <16 x i1>, <16 x i1> } undef, <16 x i1> %t0, 0 + %res1 = insertvalue { <16 x i1>, <16 x i1> } %res0, <16 x i1> %t1, 1 + ret {<16 x i1>, <16 x i1>} %res1 } define {<16 x i8>, <16 x i8>} @vector_deinterleave_load_v16i8_v32i8(ptr %p) { @@ -51,8 +55,12 @@ define {<16 x i8>, <16 x i8>} @vector_deinterleave_load_v16i8_v32i8(ptr %p) { ; CHECK-NEXT: vlseg2e8.v v8, (a0) ; CHECK-NEXT: ret %vec = load <32 x i8>, ptr %p - %retval = call {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8> %vec) - ret {<16 x i8>, <16 x i8>} %retval + %deinterleaved.results = call {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8> %vec) + %t0 = extractvalue { <16 x i8>, <16 x i8> } %deinterleaved.results, 0 + %t1 = extractvalue { <16 x i8>, <16 x i8> } %deinterleaved.results, 1 + %res0 = insertvalue { <16 x i8>, <16 x i8> } undef, <16 x i8> %t0, 0 + %res1 = insertvalue { <16 x i8>, <16 x i8> } %res0, <16 x i8> %t1, 1 + ret {<16 x i8>, <16 x i8>} %res1 } ; Shouldn't be lowered to vlseg because it's unaligned @@ -67,8 +75,12 @@ define {<8 x i16>, <8 x i16>} @vector_deinterleave_load_v8i16_v16i16_align1(ptr ; CHECK-NEXT: vnsrl.wi v9, v10, 16 ; CHECK-NEXT: ret %vec = load <16 x i16>, ptr %p, align 1 - %retval = call {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16> %vec) - ret {<8 x i16>, <8 x i16>} %retval + %deinterleaved.results = call {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16> %vec) + %t0 = extractvalue { <8 x i16>, <8 x i16> } %deinterleaved.results, 0 + %t1 = extractvalue { <8 x i16>, <8 x i16> } %deinterleaved.results, 1 + %res0 = insertvalue { <8 x i16>, <8 x i16> } undef, <8 x i16> %t0, 0 + %res1 = insertvalue { <8 x i16>, <8 x i16> } %res0, <8 x i16> %t1, 1 + ret {<8 x i16>, <8 x i16>} %res1 } define {<8 x i16>, <8 x i16>} @vector_deinterleave_load_v8i16_v16i16(ptr %p) { @@ -78,8 +90,12 @@ define {<8 x i16>, <8 x i16>} @vector_deinterleave_load_v8i16_v16i16(ptr %p) { ; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret %vec = load <16 x i16>, ptr %p - %retval = call {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16> %vec) - ret {<8 x i16>, <8 x i16>} %retval + %deinterleaved.results = call {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16> %vec) + %t0 = extractvalue { <8 x i16>, <8 x i16> } %deinterleaved.results, 0 + %t1 = extractvalue { <8 x i16>, <8 x i16> } %deinterleaved.results, 1 + %res0 = insertvalue { <8 x i16>, <8 x i16> } undef, <8 x i16> %t0, 0 + %res1 = insertvalue { <8 x i16>, <8 x i16> } %res0, <8 x i16> %t1, 1 + ret {<8 x i16>, <8 x i16>} %res1 } define {<4 x i32>, <4 x i32>} 
@vector_deinterleave_load_v4i32_vv8i32(ptr %p) { @@ -89,8 +105,12 @@ define {<4 x i32>, <4 x i32>} @vector_deinterleave_load_v4i32_vv8i32(ptr %p) { ; CHECK-NEXT: vlseg2e32.v v8, (a0) ; CHECK-NEXT: ret %vec = load <8 x i32>, ptr %p - %retval = call {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32> %vec) - ret {<4 x i32>, <4 x i32>} %retval + %deinterleaved.results = call {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32> %vec) + %t0 = extractvalue { <4 x i32>, <4 x i32> } %deinterleaved.results, 0 + %t1 = extractvalue { <4 x i32>, <4 x i32> } %deinterleaved.results, 1 + %res0 = insertvalue { <4 x i32>, <4 x i32> } undef, <4 x i32> %t0, 0 + %res1 = insertvalue { <4 x i32>, <4 x i32> } %res0, <4 x i32> %t1, 1 + ret {<4 x i32>, <4 x i32>} %res1 } define {<2 x i64>, <2 x i64>} @vector_deinterleave_load_v2i64_v4i64(ptr %p) { @@ -100,16 +120,14 @@ define {<2 x i64>, <2 x i64>} @vector_deinterleave_load_v2i64_v4i64(ptr %p) { ; CHECK-NEXT: vlseg2e64.v v8, (a0) ; CHECK-NEXT: ret %vec = load <4 x i64>, ptr %p - %retval = call {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64> %vec) - ret {<2 x i64>, <2 x i64>} %retval + %deinterleaved.results = call {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64> %vec) + %t0 = extractvalue { <2 x i64>, <2 x i64> } %deinterleaved.results, 0 + %t1 = extractvalue { <2 x i64>, <2 x i64> } %deinterleaved.results, 1 + %res0 = insertvalue { <2 x i64>, <2 x i64> } undef, <2 x i64> %t0, 0 + %res1 = insertvalue { <2 x i64>, <2 x i64> } %res0, <2 x i64> %t1, 1 + ret {<2 x i64>, <2 x i64>} %res1 } -declare {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1>) -declare {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8>) -declare {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16>) -declare {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32>) -declare {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64>) - ; Floats define {<2 x bfloat>, <2 x bfloat>} @vector_deinterleave_load_v2bf16_v4bf16(ptr %p) { @@ -119,8 +137,12 @@ define {<2 x bfloat>, <2 x bfloat>} @vector_deinterleave_load_v2bf16_v4bf16(ptr ; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret %vec = load <4 x bfloat>, ptr %p - %retval = call {<2 x bfloat>, <2 x bfloat>} @llvm.vector.deinterleave2.v4bf16(<4 x bfloat> %vec) - ret {<2 x bfloat>, <2 x bfloat>} %retval + %deinterleaved.results = call {<2 x bfloat>, <2 x bfloat>} @llvm.vector.deinterleave2.v4bf16(<4 x bfloat> %vec) + %t0 = extractvalue { <2 x bfloat>, <2 x bfloat> } %deinterleaved.results, 0 + %t1 = extractvalue { <2 x bfloat>, <2 x bfloat> } %deinterleaved.results, 1 + %res0 = insertvalue { <2 x bfloat>, <2 x bfloat> } undef, <2 x bfloat> %t0, 0 + %res1 = insertvalue { <2 x bfloat>, <2 x bfloat> } %res0, <2 x bfloat> %t1, 1 + ret {<2 x bfloat>, <2 x bfloat>} %res1 } define {<4 x bfloat>, <4 x bfloat>} @vector_deinterleave_load_v4bf16_v8bf16(ptr %p) { @@ -130,8 +152,12 @@ define {<4 x bfloat>, <4 x bfloat>} @vector_deinterleave_load_v4bf16_v8bf16(ptr ; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret %vec = load <8 x bfloat>, ptr %p - %retval = call {<4 x bfloat>, <4 x bfloat>} @llvm.vector.deinterleave2.v8bf16(<8 x bfloat> %vec) - ret {<4 x bfloat>, <4 x bfloat>} %retval + %deinterleaved.results = call {<4 x bfloat>, <4 x bfloat>} @llvm.vector.deinterleave2.v8bf16(<8 x bfloat> %vec) + %t0 = extractvalue { <4 x bfloat>, <4 x bfloat> } %deinterleaved.results, 0 + %t1 = extractvalue { <4 x bfloat>, <4 x bfloat> } 
%deinterleaved.results, 1 + %res0 = insertvalue { <4 x bfloat>, <4 x bfloat> } undef, <4 x bfloat> %t0, 0 + %res1 = insertvalue { <4 x bfloat>, <4 x bfloat> } %res0, <4 x bfloat> %t1, 1 + ret {<4 x bfloat>, <4 x bfloat>} %res1 } define {<2 x half>, <2 x half>} @vector_deinterleave_load_v2f16_v4f16(ptr %p) { @@ -141,8 +167,12 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_load_v2f16_v4f16(ptr %p) { ; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret %vec = load <4 x half>, ptr %p - %retval = call {<2 x half>, <2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half> %vec) - ret {<2 x half>, <2 x half>} %retval + %deinterleaved.results = call {<2 x half>, <2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half> %vec) + %t0 = extractvalue { <2 x half>, <2 x half> } %deinterleaved.results, 0 + %t1 = extractvalue { <2 x half>, <2 x half> } %deinterleaved.results, 1 + %res0 = insertvalue { <2 x half>, <2 x half> } undef, <2 x half> %t0, 0 + %res1 = insertvalue { <2 x half>, <2 x half> } %res0, <2 x half> %t1, 1 + ret {<2 x half>, <2 x half>} %res1 } define {<4 x half>, <4 x half>} @vector_deinterleave_load_v4f16_v8f16(ptr %p) { @@ -152,8 +182,12 @@ define {<4 x half>, <4 x half>} @vector_deinterleave_load_v4f16_v8f16(ptr %p) { ; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret %vec = load <8 x half>, ptr %p - %retval = call {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half> %vec) - ret {<4 x half>, <4 x half>} %retval + %deinterleaved.results = call {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half> %vec) + %t0 = extractvalue { <4 x half>, <4 x half> } %deinterleaved.results, 0 + %t1 = extractvalue { <4 x half>, <4 x half> } %deinterleaved.results, 1 + %res0 = insertvalue { <4 x half>, <4 x half> } undef, <4 x half> %t0, 0 + %res1 = insertvalue { <4 x half>, <4 x half> } %res0, <4 x half> %t1, 1 + ret {<4 x half>, <4 x half>} %res1 } define {<2 x float>, <2 x float>} @vector_deinterleave_load_v2f32_v4f32(ptr %p) { @@ -163,8 +197,12 @@ define {<2 x float>, <2 x float>} @vector_deinterleave_load_v2f32_v4f32(ptr %p) ; CHECK-NEXT: vlseg2e32.v v8, (a0) ; CHECK-NEXT: ret %vec = load <4 x float>, ptr %p - %retval = call {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float> %vec) - ret {<2 x float>, <2 x float>} %retval + %deinterleaved.results = call {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float> %vec) + %t0 = extractvalue { <2 x float>, <2 x float> } %deinterleaved.results, 0 + %t1 = extractvalue { <2 x float>, <2 x float> } %deinterleaved.results, 1 + %res0 = insertvalue { <2 x float>, <2 x float> } undef, <2 x float> %t0, 0 + %res1 = insertvalue { <2 x float>, <2 x float> } %res0, <2 x float> %t1, 1 + ret {<2 x float>, <2 x float>} %res1 } define {<8 x bfloat>, <8 x bfloat>} @vector_deinterleave_load_v8bf16_v16bf16(ptr %p) { @@ -174,8 +212,12 @@ define {<8 x bfloat>, <8 x bfloat>} @vector_deinterleave_load_v8bf16_v16bf16(ptr ; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret %vec = load <16 x bfloat>, ptr %p - %retval = call {<8 x bfloat>, <8 x bfloat>} @llvm.vector.deinterleave2.v16bf16(<16 x bfloat> %vec) - ret {<8 x bfloat>, <8 x bfloat>} %retval + %deinterleaved.results = call {<8 x bfloat>, <8 x bfloat>} @llvm.vector.deinterleave2.v16bf16(<16 x bfloat> %vec) + %t0 = extractvalue { <8 x bfloat>, <8 x bfloat> } %deinterleaved.results, 0 + %t1 = extractvalue { <8 x bfloat>, <8 x bfloat> } %deinterleaved.results, 1 + %res0 = insertvalue { <8 x bfloat>, <8 x bfloat> } undef, <8 x bfloat> %t0, 0 + %res1 = 
insertvalue { <8 x bfloat>, <8 x bfloat> } %res0, <8 x bfloat> %t1, 1 + ret {<8 x bfloat>, <8 x bfloat>} %res1 } define {<8 x half>, <8 x half>} @vector_deinterleave_load_v8f16_v16f16(ptr %p) { @@ -185,8 +227,12 @@ define {<8 x half>, <8 x half>} @vector_deinterleave_load_v8f16_v16f16(ptr %p) { ; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret %vec = load <16 x half>, ptr %p - %retval = call {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half> %vec) - ret {<8 x half>, <8 x half>} %retval + %deinterleaved.results = call {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half> %vec) + %t0 = extractvalue { <8 x half>, <8 x half> } %deinterleaved.results, 0 + %t1 = extractvalue { <8 x half>, <8 x half> } %deinterleaved.results, 1 + %res0 = insertvalue { <8 x half>, <8 x half> } undef, <8 x half> %t0, 0 + %res1 = insertvalue { <8 x half>, <8 x half> } %res0, <8 x half> %t1, 1 + ret {<8 x half>, <8 x half>} %res1 } define {<4 x float>, <4 x float>} @vector_deinterleave_load_v4f32_v8f32(ptr %p) { @@ -196,8 +242,12 @@ define {<4 x float>, <4 x float>} @vector_deinterleave_load_v4f32_v8f32(ptr %p) ; CHECK-NEXT: vlseg2e32.v v8, (a0) ; CHECK-NEXT: ret %vec = load <8 x float>, ptr %p - %retval = call {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float> %vec) - ret {<4 x float>, <4 x float>} %retval + %deinterleaved.results = call {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float> %vec) + %t0 = extractvalue { <4 x float>, <4 x float> } %deinterleaved.results, 0 + %t1 = extractvalue { <4 x float>, <4 x float> } %deinterleaved.results, 1 + %res0 = insertvalue { <4 x float>, <4 x float> } undef, <4 x float> %t0, 0 + %res1 = insertvalue { <4 x float>, <4 x float> } %res0, <4 x float> %t1, 1 + ret {<4 x float>, <4 x float>} %res1 } define {<2 x double>, <2 x double>} @vector_deinterleave_load_v2f64_v4f64(ptr %p) { @@ -207,13 +257,75 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_load_v2f64_v4f64(ptr %p ; CHECK-NEXT: vlseg2e64.v v8, (a0) ; CHECK-NEXT: ret %vec = load <4 x double>, ptr %p - %retval = call {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double> %vec) - ret {<2 x double>, <2 x double>} %retval + %deinterleaved.results = call {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double> %vec) + %t0 = extractvalue { <2 x double>, <2 x double> } %deinterleaved.results, 0 + %t1 = extractvalue { <2 x double>, <2 x double> } %deinterleaved.results, 1 + %res0 = insertvalue { <2 x double>, <2 x double> } undef, <2 x double> %t0, 0 + %res1 = insertvalue { <2 x double>, <2 x double> } %res0, <2 x double> %t1, 1 + ret {<2 x double>, <2 x double>} %res1 +} + +define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor4(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_factor4: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vlseg4e8.v v8, (a0) +; CHECK-NEXT: ret + %vec = load <32 x i8>, ptr %p + %d0 = call {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8> %vec) + %d0.0 = extractvalue { <16 x i8>, <16 x i8> } %d0, 0 + %d0.1 = extractvalue { <16 x i8>, <16 x i8> } %d0, 1 + %d1 = call {<8 x i8>, <8 x i8>} @llvm.vector.deinterleave2.v16i8(<16 x i8> %d0.0) + %t0 = extractvalue { <8 x i8>, <8 x i8> } %d1, 0 + %t2 = extractvalue { <8 x i8>, <8 x i8> } %d1, 1 + %d2 = call {<8 x i8>, <8 x i8>} @llvm.vector.deinterleave2.v16i8(<16 x i8> %d0.1) + %t1 = extractvalue { <8 x i8>, <8 x i8> } %d2, 0 + %t3 = extractvalue { <8 x 
i8>, <8 x i8> } %d2, 1 + + %res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } undef, <8 x i8> %t0, 0 + %res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 1 + %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 2 + %res3 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res2, <8 x i8> %t3, 3 + ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3 } -declare {<2 x half>,<2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half>) -declare {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half>) -declare {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float>) -declare {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half>) -declare {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float>) -declare {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double>) +define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave_load_factor8(ptr %ptr) { +; CHECK-LABEL: vector_deinterleave_load_factor8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vlseg8e32.v v8, (a0) +; CHECK-NEXT: ret + %vec = load <16 x i32>, ptr %ptr + %d0 = call { <8 x i32>, <8 x i32> } @llvm.vector.deinterleave2.v16i32(<16 x i32> %vec) + %d0.0 = extractvalue { <8 x i32>, <8 x i32> } %d0, 0 + %d0.1 = extractvalue { <8 x i32>, <8 x i32> } %d0, 1 + %d1 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %d0.0) + %d1.0 = extractvalue { <4 x i32>, <4 x i32> } %d1, 0 + %d1.1 = extractvalue { <4 x i32>, <4 x i32> } %d1, 1 + %d2 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %d0.1) + %d2.0 = extractvalue { <4 x i32>, <4 x i32> } %d2, 0 + %d2.1 = extractvalue { <4 x i32>, <4 x i32> } %d2, 1 + + %d3 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d1.0) + %t0 = extractvalue { <2 x i32>, <2 x i32> } %d3, 0 + %t4 = extractvalue { <2 x i32>, <2 x i32> } %d3, 1 + %d4 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d1.1) + %t2 = extractvalue { <2 x i32>, <2 x i32> } %d4, 0 + %t6 = extractvalue { <2 x i32>, <2 x i32> } %d4, 1 + %d5 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d2.0) + %t1 = extractvalue { <2 x i32>, <2 x i32> } %d5, 0 + %t5 = extractvalue { <2 x i32>, <2 x i32> } %d5, 1 + %d6 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d2.1) + %t3 = extractvalue { <2 x i32>, <2 x i32> } %d6, 0 + %t7 = extractvalue { <2 x i32>, <2 x i32> } %d6, 1 + + %res0 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } undef, <2 x i32> %t0, 0 + %res1 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res0, <2 x i32> %t1, 1 + %res2 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res1, <2 x i32> %t2, 2 + %res3 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res2, <2 x i32> %t3, 3 + %res4 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res3, <2 x i32> %t4, 4 + %res5 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res4, <2 x i32> %t5, 5 + %res6 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x 
i32>, <2 x i32>, <2 x i32> } %res5, <2 x i32> %t6, 6 + %res7 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res6, <2 x i32> %t7, 7 + ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res7 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll index 123e2243647953..26c3db61310342 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll @@ -80,11 +80,6 @@ define void @vector_interleave_store_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b, ptr ret void } -declare <32 x i1> @llvm.vector.interleave2.v32i1(<16 x i1>, <16 x i1>) -declare <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16>, <8 x i16>) -declare <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32>, <4 x i32>) -declare <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64>, <2 x i64>) - ; Floats define void @vector_interleave_store_v4bf16_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, ptr %p) { @@ -186,10 +181,34 @@ define void @vector_interleave_store_v4f64_v2f64(<2 x double> %a, <2 x double> % ret void } +define void @vector_interleave_store_factor4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, ptr %p) { +; CHECK-LABEL: vector_interleave_store_factor4: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg4e32.v v8, (a0) +; CHECK-NEXT: ret + %v0 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %c) + %v1 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %b, <4 x i32> %d) + %v2 = call <16 x i32> @llvm.vector.interleave2.v16i32(<8 x i32> %v0, <8 x i32> %v1) + store <16 x i32> %v2, ptr %p + ret void +} -declare <4 x half> @llvm.vector.interleave2.v4f16(<2 x half>, <2 x half>) -declare <8 x half> @llvm.vector.interleave2.v8f16(<4 x half>, <4 x half>) -declare <4 x float> @llvm.vector.interleave2.v4f32(<2 x float>, <2 x float>) -declare <16 x half> @llvm.vector.interleave2.v16f16(<8 x half>, <8 x half>) -declare <8 x float> @llvm.vector.interleave2.v8f32(<4 x float>, <4 x float>) -declare <4 x double> @llvm.vector.interleave2.v4f64(<2 x double>, <2 x double>) +define void @vector_interleave_store_factor8(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h, ptr %p) { +; CHECK-LABEL: vector_interleave_store_factor8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg8e32.v v8, (a0) +; CHECK-NEXT: ret + %v0 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %e) + %v1 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %c, <4 x i32> %g) + %v2 = call <16 x i32> @llvm.vector.interleave2.v16i32(<8 x i32> %v0, <8 x i32> %v1) + + %v3 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %b, <4 x i32> %f) + %v4 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %d, <4 x i32> %h) + %v5 = call <16 x i32> @llvm.vector.interleave2.v16i32(<8 x i32> %v3, <8 x i32> %v4) + + %v6 = call <32 x i32> @llvm.vector.interleave2.v32i32(<16 x i32> %v2, <16 x i32> %v5) + store <32 x i32> %v6, ptr %p + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll index 34f0f9d9598c99..14f306da21dba7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll @@ 
-26,8 +26,12 @@ define {, } @vector_deinterleave_load_nxv16i ; CHECK-NEXT: vmsne.vi v8, v10, 0 ; CHECK-NEXT: ret %vec = load , ptr %p - %retval = call {, } @llvm.vector.deinterleave2.nxv32i1( %vec) - ret {, } %retval + %deinterleaved.results = call {, } @llvm.vector.deinterleave2.nxv32i1( %vec) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret {, } %res1 } define {, } @vector_deinterleave_load_nxv16i8_nxv32i8(ptr %p) { @@ -37,8 +41,12 @@ define {, } @vector_deinterleave_load_nxv16i ; CHECK-NEXT: vlseg2e8.v v8, (a0) ; CHECK-NEXT: ret %vec = load , ptr %p - %retval = call {, } @llvm.vector.deinterleave2.nxv32i8( %vec) - ret {, } %retval + %deinterleaved.results = call {, } @llvm.vector.deinterleave2.nxv32i8( %vec) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret {, } %res1 } ; Shouldn't be lowered to vlseg because it's unaligned @@ -51,8 +59,12 @@ define {, } @vector_deinterleave_load_nxv8i1 ; CHECK-NEXT: vnsrl.wi v10, v12, 16 ; CHECK-NEXT: ret %vec = load , ptr %p, align 1 - %retval = call {, } @llvm.vector.deinterleave2.nxv16i16( %vec) - ret {, } %retval + %deinterleaved.results = call {, } @llvm.vector.deinterleave2.nxv16i16( %vec) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret {, } %res1 } define {, } @vector_deinterleave_load_nxv8i16_nxv16i16(ptr %p) { @@ -62,8 +74,12 @@ define {, } @vector_deinterleave_load_nxv8i1 ; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret %vec = load , ptr %p - %retval = call {, } @llvm.vector.deinterleave2.nxv16i16( %vec) - ret {, } %retval + %deinterleaved.results = call {, } @llvm.vector.deinterleave2.nxv16i16( %vec) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret {, } %res1 } define {, } @vector_deinterleave_load_nxv4i32_nxvv8i32(ptr %p) { @@ -73,8 +89,12 @@ define {, } @vector_deinterleave_load_nxv4i3 ; CHECK-NEXT: vlseg2e32.v v8, (a0) ; CHECK-NEXT: ret %vec = load , ptr %p - %retval = call {, } @llvm.vector.deinterleave2.nxv8i32( %vec) - ret {, } %retval + %deinterleaved.results = call {, } @llvm.vector.deinterleave2.nxv8i32( %vec) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret {, } %res1 } define {, } @vector_deinterleave_load_nxv2i64_nxv4i64(ptr %p) { @@ -84,8 +104,12 @@ define {, } @vector_deinterleave_load_nxv2i6 ; CHECK-NEXT: vlseg2e64.v v8, (a0) ; CHECK-NEXT: ret %vec = load , ptr %p - %retval = call {, } @llvm.vector.deinterleave2.nxv4i64( %vec) - ret {, } %retval + %deinterleaved.results = call {, } @llvm.vector.deinterleave2.nxv4i64( %vec) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret {, } %res1 } define {, } @vector_deinterleave_load_nxv4i64_nxv8i64(ptr %p) { @@ -95,8 +119,12 @@ define {, } @vector_deinterleave_load_nxv4i6 ; CHECK-NEXT: vlseg2e64.v v8, (a0) ; CHECK-NEXT: 
ret %vec = load , ptr %p - %retval = call {, } @llvm.vector.deinterleave2.nxv8i64( %vec) - ret {, } %retval + %deinterleaved.results = call {, } @llvm.vector.deinterleave2.nxv8i64( %vec) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret {, } %res1 } ; This shouldn't be lowered to a vlseg because EMUL * NFIELDS >= 8 @@ -150,18 +178,14 @@ define {, } @vector_deinterleave_load_nxv8i6 ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %vec = load , ptr %p - %retval = call {, } @llvm.vector.deinterleave2.nxv16i64( %vec) - ret {, } %retval + %deinterleaved.results = call {, } @llvm.vector.deinterleave2.nxv16i64( %vec) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret {, } %res1 } -declare {, } @llvm.vector.deinterleave2.nxv32i1() -declare {, } @llvm.vector.deinterleave2.nxv32i8() -declare {, } @llvm.vector.deinterleave2.nxv16i16() -declare {, } @llvm.vector.deinterleave2.nxv8i32() -declare {, } @llvm.vector.deinterleave2.nxv4i64() -declare {, } @llvm.vector.deinterleave2.nxv8i64() -declare {, } @llvm.vector.deinterleave2.nxv16i64() - ; Floats define {, } @vector_deinterleave_load_nxv2bf16_nxv4bf16(ptr %p) { @@ -171,8 +195,12 @@ define {, } @vector_deinterleave_load_ ; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret %vec = load , ptr %p - %retval = call {, } @llvm.vector.deinterleave2.nxv4bf16( %vec) - ret {, } %retval + %deinterleaved.results = call {, } @llvm.vector.deinterleave2.nxv4bf16( %vec) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret {, } %res1 } define {, } @vector_deinterleave_load_nxv4bf16_nxv8bf16(ptr %p) { @@ -182,8 +210,12 @@ define {, } @vector_deinterleave_load_ ; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret %vec = load , ptr %p - %retval = call {, } @llvm.vector.deinterleave2.nxv8bf16( %vec) - ret {, } %retval + %deinterleaved.results = call {, } @llvm.vector.deinterleave2.nxv8bf16( %vec) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret {, } %res1 } define {, } @vector_deinterleave_load_nxv2f16_nxv4f16(ptr %p) { @@ -193,8 +225,12 @@ define {, } @vector_deinterleave_load_nxv2 ; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret %vec = load , ptr %p - %retval = call {, } @llvm.vector.deinterleave2.nxv4f16( %vec) - ret {, } %retval + %deinterleaved.results = call {, } @llvm.vector.deinterleave2.nxv4f16( %vec) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret {, } %res1 } define {, } @vector_deinterleave_load_nxv4f16_nxv8f16(ptr %p) { @@ -204,8 +240,12 @@ define {, } @vector_deinterleave_load_nxv4 ; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret %vec = load , ptr %p - %retval = call {, } @llvm.vector.deinterleave2.nxv8f16( %vec) - ret {, } %retval + %deinterleaved.results = call {, } @llvm.vector.deinterleave2.nxv8f16( %vec) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = 
insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret {, } %res1 } define {, } @vector_deinterleave_load_nxv2f32_nxv4f32(ptr %p) { @@ -215,8 +255,12 @@ define {, } @vector_deinterleave_load_nx ; CHECK-NEXT: vlseg2e32.v v8, (a0) ; CHECK-NEXT: ret %vec = load , ptr %p - %retval = call {, } @llvm.vector.deinterleave2.nxv4f32( %vec) - ret {, } %retval + %deinterleaved.results = call {, } @llvm.vector.deinterleave2.nxv4f32( %vec) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret {, } %res1 } define {, } @vector_deinterleave_load_nxv8bf16_nxv16bf16(ptr %p) { @@ -226,8 +270,12 @@ define {, } @vector_deinterleave_load_ ; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret %vec = load , ptr %p - %retval = call {, } @llvm.vector.deinterleave2.nxv16bf16( %vec) - ret {, } %retval + %deinterleaved.results = call {, } @llvm.vector.deinterleave2.nxv16bf16( %vec) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret {, } %res1 } define {, } @vector_deinterleave_load_nxv8f16_nxv16f16(ptr %p) { @@ -237,8 +285,12 @@ define {, } @vector_deinterleave_load_nxv8 ; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret %vec = load , ptr %p - %retval = call {, } @llvm.vector.deinterleave2.nxv16f16( %vec) - ret {, } %retval + %deinterleaved.results = call {, } @llvm.vector.deinterleave2.nxv16f16( %vec) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret {, } %res1 } define {, } @vector_deinterleave_load_nxv4f32_nxv8f32(ptr %p) { @@ -248,8 +300,12 @@ define {, } @vector_deinterleave_load_nx ; CHECK-NEXT: vlseg2e32.v v8, (a0) ; CHECK-NEXT: ret %vec = load , ptr %p - %retval = call {, } @llvm.vector.deinterleave2.nxv8f32( %vec) - ret {, } %retval + %deinterleaved.results = call {, } @llvm.vector.deinterleave2.nxv8f32( %vec) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret {, } %res1 } define {, } @vector_deinterleave_load_nxv2f64_nxv4f64(ptr %p) { @@ -259,8 +315,12 @@ define {, } @vector_deinterleave_load_ ; CHECK-NEXT: vlseg2e64.v v8, (a0) ; CHECK-NEXT: ret %vec = load , ptr %p - %retval = call {, } @llvm.vector.deinterleave2.nxv4f64( %vec) - ret {, } %retval + %deinterleaved.results = call {, } @llvm.vector.deinterleave2.nxv4f64( %vec) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret {, } %res1 } define {, } @vector_deinterleave_load_nxv2p0_nxv4p0(ptr %p) { @@ -276,14 +336,75 @@ define {, } @vector_deinterleave_load_nxv2p0 ; RV64-NEXT: vlseg2e64.v v8, (a0) ; RV64-NEXT: ret %vec = load , ptr %p - %retval = call {, } @llvm.vector.deinterleave2.nxv4p0( %vec) - ret {, } %retval + %deinterleaved.results = call {, } @llvm.vector.deinterleave2.nxv4p0( %vec) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret {, } %res1 +} + +define { , , , } 
@vector_deinterleave_load_factor4(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_factor4: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; CHECK-NEXT: vlseg4e8.v v8, (a0) +; CHECK-NEXT: ret + %vec = load , ptr %p + %d0 = call {, } @llvm.vector.deinterleave2.nxv32i8( %vec) + %d0.0 = extractvalue { , } %d0, 0 + %d0.1 = extractvalue { , } %d0, 1 + %d1 = call {, } @llvm.vector.deinterleave2.nxv16i8( %d0.0) + %t0 = extractvalue { , } %d1, 0 + %t2 = extractvalue { , } %d1, 1 + %d2 = call {, } @llvm.vector.deinterleave2.nxv16i8( %d0.1) + %t1 = extractvalue { , } %d2, 0 + %t3 = extractvalue { , } %d2, 1 + + %res0 = insertvalue { , , , } undef, %t0, 0 + %res1 = insertvalue { , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , } %res2, %t3, 3 + ret { , , , } %res3 } -declare {,} @llvm.vector.deinterleave2.nxv4f16() -declare {, } @llvm.vector.deinterleave2.nxv8f16() -declare {, } @llvm.vector.deinterleave2.nxv4f32() -declare {, } @llvm.vector.deinterleave2.nxv16f16() -declare {, } @llvm.vector.deinterleave2.nxv8f32() -declare {, } @llvm.vector.deinterleave2.nxv4f64() -declare {, } @llvm.vector.deinterleave2.nxv4p0() +define {, , , , , , , } @vector_deinterleave_load_factor8(ptr %ptr) { +; CHECK-LABEL: vector_deinterleave_load_factor8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vlseg8e32.v v8, (a0) +; CHECK-NEXT: ret + %vec = load , ptr %ptr + %d0 = call { , } @llvm.vector.deinterleave2.nxv16i32( %vec) + %d0.0 = extractvalue { , } %d0, 0 + %d0.1 = extractvalue { , } %d0, 1 + %d1 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.0) + %d1.0 = extractvalue { , } %d1, 0 + %d1.1 = extractvalue { , } %d1, 1 + %d2 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.1) + %d2.0 = extractvalue { , } %d2, 0 + %d2.1 = extractvalue { , } %d2, 1 + + %d3 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.0) + %t0 = extractvalue { , } %d3, 0 + %t4 = extractvalue { , } %d3, 1 + %d4 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.1) + %t2 = extractvalue { , } %d4, 0 + %t6 = extractvalue { , } %d4, 1 + %d5 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.0) + %t1 = extractvalue { , } %d5, 0 + %t5 = extractvalue { , } %d5, 1 + %d6 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.1) + %t3 = extractvalue { , } %d6, 0 + %t7 = extractvalue { , } %d6, 1 + + %res0 = insertvalue { , , , , , , , } undef, %t0, 0 + %res1 = insertvalue { , , , , , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , , , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , , , , , } %res2, %t3, 3 + %res4 = insertvalue { , , , , , , , } %res3, %t4, 4 + %res5 = insertvalue { , , , , , , , } %res4, %t5, 5 + %res6 = insertvalue { , , , , , , , } %res5, %t6, 6 + %res7 = insertvalue { , , , , , , , } %res6, %t7, 7 + ret { , , , , , , , } %res7 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll index 9b78f31d399d9a..8f6365d35f885e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll @@ -121,13 +121,6 @@ define void @vector_interleave_store_nxv16i64_nxv8i64( %a, @llvm.vector.interleave2.nxv32i1(, ) -declare @llvm.vector.interleave2.nxv16i16(, ) -declare @llvm.vector.interleave2.nxv8i32(, ) -declare @llvm.vector.interleave2.nxv4i64(, ) -declare @llvm.vector.interleave2.nxv8i64(, ) -declare @llvm.vector.interleave2.nxv16i64(, ) - ; Floats define void 
define void @vector_interleave_store_nxv4bf16_nxv2bf16( %a, %b, ptr %p) {
@@ -246,10 +239,34 @@ define void @vector_interleave_store_nxv4p0_nxv2p0( %a, @llvm.vector.interleave2.nxv4f16(, )
-declare @llvm.vector.interleave2.nxv8f16(, )
-declare @llvm.vector.interleave2.nxv4f32(, )
-declare @llvm.vector.interleave2.nxv16f16(, )
-declare @llvm.vector.interleave2.nxv8f32(, )
-declare @llvm.vector.interleave2.nxv4f64(, )
-declare @llvm.vector.interleave2.nxv4p0(, )
+define void @vector_interleave_store_factor4( %a, %b, %c, %d, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_factor4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; CHECK-NEXT: vsseg4e32.v v8, (a0)
+; CHECK-NEXT: ret
+  %v0 = call @llvm.vector.interleave2.nxv8i32( %a, %c)
+  %v1 = call @llvm.vector.interleave2.nxv8i32( %b, %d)
+  %v2 = call @llvm.vector.interleave2.nxv16i32( %v0, %v1)
+  store %v2, ptr %p
+  ret void
+}
+
+define void @vector_interleave_store_factor8( %a, %b, %c, %d, %e, %f, %g, %h, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_factor8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg8e32.v v8, (a0)
+; CHECK-NEXT: ret
+  %v0 = call @llvm.vector.interleave2.nxv4i32( %a, %e)
+  %v1 = call @llvm.vector.interleave2.nxv4i32( %c, %g)
+  %v2 = call @llvm.vector.interleave2.nxv8i32( %v0, %v1)
+
+  %v3 = call @llvm.vector.interleave2.nxv4i32( %b, %f)
+  %v4 = call @llvm.vector.interleave2.nxv4i32( %d, %h)
+  %v5 = call @llvm.vector.interleave2.nxv8i32( %v3, %v4)
+
+  %v6 = call @llvm.vector.interleave2.nxv16i32( %v2, %v5)
+  store %v6, ptr %p
+  ret void
+}
diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll
index e601ba4191459e..88d5461083541a 100644
--- a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll
@@ -49,6 +49,8 @@ define void @load_factor2_vscale(ptr %ptr) {
; RV32-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
; RV32-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv8i32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", , 2) [[TMP1]], i32 1)
; RV32-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1
+; RV32-NEXT: [[TMP6:%.*]] = extractvalue { , } [[TMP5]], 0
+; RV32-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP5]], 1
; RV32-NEXT: ret void
;
; RV64-LABEL: @load_factor2_vscale(
@@ -57,10 +59,14 @@ define void @load_factor2_vscale(ptr %ptr) {
; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0
; RV64-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv8i32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", , 2) [[TMP1]], i32 1)
; RV64-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1
+; RV64-NEXT: [[TMP6:%.*]] = extractvalue { , } [[TMP5]], 0
+; RV64-NEXT: [[TMP7:%.*]] = extractvalue { , } [[TMP5]], 1
; RV64-NEXT: ret void
;
  %interleaved.vec = load , ptr %ptr
  %v = call { , } @llvm.vector.deinterleave2.nxv16i32( %interleaved.vec)
+  %t0 = extractvalue { , } %v, 0
+  %t1 = extractvalue { , } %v, 1
  ret void
}

@@ -68,15 +74,21 @@ define void @load_factor2_vscale_as(ptr addrspace(1) %ptr) {
; RV32-LABEL: @load_factor2_vscale_as(
; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load , ptr addrspace(1) [[PTR:%.*]], align 64
; RV32-NEXT: [[V:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[INTERLEAVED_VEC]])
+; RV32-NEXT: [[T0:%.*]] = extractvalue { , } [[V]], 0
+; RV32-NEXT: [[T1:%.*]] = extractvalue { , } [[V]], 1
; RV32-NEXT: ret void
;
; RV64-LABEL: @load_factor2_vscale_as(
; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = load , ptr addrspace(1) [[PTR:%.*]], align 64
; RV64-NEXT: [[V:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[INTERLEAVED_VEC]])
+; RV64-NEXT: [[T0:%.*]] = extractvalue { , } [[V]], 0
+; RV64-NEXT: [[T1:%.*]] = extractvalue { , } [[V]], 1
; RV64-NEXT: ret void
;
  %interleaved.vec = load , ptr addrspace(1) %ptr
  %v = call { , } @llvm.vector.deinterleave2.nxv16i32( %interleaved.vec)
+  %t0 = extractvalue { , } %v, 0
+  %t1 = extractvalue { , } %v, 1
  ret void
}

@@ -127,6 +139,52 @@ define void @load_factor4(ptr %ptr) {
  ret void
}

+define void @load_factor4_vscale(ptr %ptr) {
+; RV32-LABEL: @load_factor4_vscale(
+; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t.i32(target("riscv.vector.tuple", , 4) poison, ptr [[PTR:%.*]], i32 -1, i32 5)
+; RV32-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 0)
+; RV32-NEXT: [[TMP3:%.*]] = insertvalue { , , , } poison, [[TMP2]], 0
+; RV32-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 1)
+; RV32-NEXT: [[TMP5:%.*]] = insertvalue { , , , } [[TMP3]], [[TMP4]], 1
+; RV32-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 2)
+; RV32-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 2
+; RV32-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 3)
+; RV32-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 3
+; RV32-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP9]], 0
+; RV32-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP9]], 1
+; RV32-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[TMP9]], 2
+; RV32-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP9]], 3
+; RV32-NEXT: ret void
+;
+; RV64-LABEL: @load_factor4_vscale(
+; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t.i64(target("riscv.vector.tuple", , 4) poison, ptr [[PTR:%.*]], i64 -1, i64 5)
+; RV64-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 0)
+; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , , , } poison, [[TMP2]], 0
+; RV64-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 1)
+; RV64-NEXT: [[TMP5:%.*]] = insertvalue { , , , } [[TMP3]], [[TMP4]], 1
+; RV64-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 2)
+; RV64-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 2
+; RV64-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 3)
+; RV64-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 3
+; RV64-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP9]], 0
+; RV64-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP9]], 1
+; RV64-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[TMP9]], 2
+; RV64-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP9]], 3
+; RV64-NEXT: ret void
+;
+  %interleaved.vec = load , ptr %ptr
+  %d0 = call { , } @llvm.vector.deinterleave2.nxv16i32( %interleaved.vec)
+  %d0.0 = extractvalue { , } %d0, 0
+  %d0.1 = extractvalue { , } %d0, 1
+  %d1 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.0)
+  %t0 = extractvalue { , } %d1, 0
+  %t1 = extractvalue { , } %d1, 1
+  %d2 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.1)
+  %t2 = extractvalue { , } %d2, 0
+  %t3 = extractvalue { , } %d2, 1
+  ret void
+}
+
define void @load_factor5(ptr %ptr) {
; RV32-LABEL: @load_factor5(
; RV32-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg5.load.v4i32.p0.i32(ptr [[PTR:%.*]], i32 4)
@@ -257,6 +315,90 @@ define void @load_factor8(ptr %ptr) {
  ret void
}

+define void @load_factor8_vscale(ptr %ptr) {
+; RV32-LABEL: @load_factor8_vscale(
+; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t.i32(target("riscv.vector.tuple", , 8) poison, ptr [[PTR:%.*]], i32 -1, i32 5)
+; RV32-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 0)
+; RV32-NEXT: [[TMP3:%.*]] = insertvalue { , , , , , , , } poison, [[TMP2]], 0
+; RV32-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 1)
+; RV32-NEXT: [[TMP5:%.*]] = insertvalue { , , , , , , , } [[TMP3]], [[TMP4]], 1
+; RV32-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 2)
+; RV32-NEXT: [[TMP7:%.*]] = insertvalue { , , , , , , , } [[TMP5]], [[TMP6]], 2
+; RV32-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 3)
+; RV32-NEXT: [[TMP9:%.*]] = insertvalue { , , , , , , , } [[TMP7]], [[TMP8]], 3
+; RV32-NEXT: [[TMP10:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 4)
+; RV32-NEXT: [[TMP11:%.*]] = insertvalue { , , , , , , , } [[TMP9]], [[TMP10]], 4
+; RV32-NEXT: [[TMP12:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 5)
+; RV32-NEXT: [[TMP13:%.*]] = insertvalue { , , , , , , , } [[TMP11]], [[TMP12]], 5
+; RV32-NEXT: [[TMP14:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 6)
+; RV32-NEXT: [[TMP15:%.*]] = insertvalue { , , , , , , , } [[TMP13]], [[TMP14]], 6
+; RV32-NEXT: [[TMP16:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 7)
+; RV32-NEXT: [[TMP17:%.*]] = insertvalue { , , , , , , , } [[TMP15]], [[TMP16]], 7
+; RV32-NEXT: [[TMP18:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 0
+; RV32-NEXT: [[TMP19:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 1
+; RV32-NEXT: [[TMP20:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 2
+; RV32-NEXT: [[TMP21:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 3
+; RV32-NEXT: [[TMP22:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 4
+; RV32-NEXT: [[TMP23:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 5
+; RV32-NEXT: [[TMP24:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 6
+; RV32-NEXT: [[TMP25:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 7
+; RV32-NEXT: ret void
+;
+; RV64-LABEL: @load_factor8_vscale(
+; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t.i64(target("riscv.vector.tuple", , 8) poison, ptr [[PTR:%.*]], i64 -1, i64 5)
+; RV64-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 0)
+; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , , , , , , , } poison, [[TMP2]], 0
+; RV64-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 1)
+; RV64-NEXT: [[TMP5:%.*]] = insertvalue { , , , , , , , } [[TMP3]], [[TMP4]], 1
+; RV64-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 2)
+; RV64-NEXT: [[TMP7:%.*]] = insertvalue { , , , , , , , } [[TMP5]], [[TMP6]], 2
+; RV64-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 3)
+; RV64-NEXT: [[TMP9:%.*]] = insertvalue { , , , , , , , } [[TMP7]], [[TMP8]], 3
+; RV64-NEXT: [[TMP10:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 4)
+; RV64-NEXT: [[TMP11:%.*]] = insertvalue { , , , , , , , } [[TMP9]], [[TMP10]], 4
+; RV64-NEXT: [[TMP12:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 5)
+; RV64-NEXT: [[TMP13:%.*]] = insertvalue { , , , , , , , } [[TMP11]], [[TMP12]], 5
+; RV64-NEXT: [[TMP14:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 6)
+; RV64-NEXT: [[TMP15:%.*]] = insertvalue { , , , , , , , } [[TMP13]], [[TMP14]], 6
+; RV64-NEXT: [[TMP16:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 7)
+; RV64-NEXT: [[TMP17:%.*]] = insertvalue { , , , , , , , } [[TMP15]], [[TMP16]], 7
+; RV64-NEXT: [[TMP18:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 0
+; RV64-NEXT: [[TMP19:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 1
+; RV64-NEXT: [[TMP20:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 2
+; RV64-NEXT: [[TMP21:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 3
+; RV64-NEXT: [[TMP22:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 4
+; RV64-NEXT: [[TMP23:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 5
+; RV64-NEXT: [[TMP24:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 6
+; RV64-NEXT: [[TMP25:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 7
+; RV64-NEXT: ret void
+;
+  %interleaved.vec = load , ptr %ptr
+  %d0 = call { , } @llvm.vector.deinterleave2.nxv16i32( %interleaved.vec)
+  %d0.0 = extractvalue { , } %d0, 0
+  %d0.1 = extractvalue { , } %d0, 1
+
+  %d1 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.0)
+  %d1.0 = extractvalue { , } %d1, 0
+  %d1.1 = extractvalue { , } %d1, 1
+  %d2 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.1)
+  %d2.0 = extractvalue { , } %d2, 0
+  %d2.1 = extractvalue { , } %d2, 1
+
+  %d3 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.0)
+  %t0 = extractvalue { , } %d3, 0
+  %t1 = extractvalue { , } %d3, 1
+  %d4 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.1)
+  %t2 = extractvalue { , } %d4, 0
+  %t3 = extractvalue { , } %d4, 1
+  %d5 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.0)
+  %t4 = extractvalue { , } %d5, 0
+  %t5 = extractvalue { , } %d5, 1
+  %d6 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.1)
+  %t6 = extractvalue { , } %d6, 0
+  %t7 = extractvalue { , } %d6, 1
+  ret void
+}
+
define void @store_factor2(ptr %ptr, <8 x i8> %v0, <8 x i8> %v1) {
; RV32-LABEL: @store_factor2(
@@ -382,6 +524,30 @@ define void @store_factor4(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2
  ret void
}

+define void @store_factor4_vscale(ptr %ptr, %v0, %v1) {
+; RV32-LABEL: @store_factor4_vscale(
+; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) poison, [[V0:%.*]], i32 0)
+; RV32-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP1]], [[V0]], i32 1)
+; RV32-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP2]], [[V1:%.*]], i32 2)
+; RV32-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP3]], [[V1]], i32 3)
+; RV32-NEXT: call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv8i8_4t.i32(target("riscv.vector.tuple", , 4) [[TMP4]], ptr [[PTR:%.*]], i32 -1, i32 3)
+; RV32-NEXT: ret void
+;
+; RV64-LABEL: @store_factor4_vscale(
+; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) poison, [[V0:%.*]], i32 0)
+; RV64-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP1]], [[V0]], i32 1)
+; RV64-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP2]], [[V1:%.*]], i32 2)
+; RV64-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP3]], [[V1]], i32 3)
+; RV64-NEXT: call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv8i8_4t.i64(target("riscv.vector.tuple", , 4) [[TMP4]], ptr [[PTR:%.*]], i64 -1, i64 3)
+; RV64-NEXT: ret void
+;
+  %i0 = call @llvm.vector.interleave2.nxv8i8( %v0, %v1)
+  %i1 = call @llvm.vector.interleave2.nxv8i8( %v0, %v1)
+  %i2 = call @llvm.vector.interleave2.nxv16i8( %i0, %i1)
+  store %i2, ptr %ptr, align 4
+  ret void
+}
+
define void @store_factor2_wide(ptr %ptr, <8 x i32> %v0, <8 x i32> %v1) {
; RV32-LABEL: @store_factor2_wide(
@@ -455,6 +621,44 @@ define void @store_factor4_wide(ptr %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32
  ret void
}

+define void @store_factor8_vscale(ptr %ptr, %v0, %v1, %v2, %v3) {
+; RV32-LABEL: @store_factor8_vscale(
+; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) poison, [[V0:%.*]], i32 0)
+; RV32-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP1]], [[V2:%.*]], i32 1)
+; RV32-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP2]], [[V0]], i32 2)
+; RV32-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP3]], [[V2]], i32 3)
+; RV32-NEXT: [[TMP5:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP4]], [[V1:%.*]], i32 4)
+; RV32-NEXT: [[TMP6:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP5]], [[V3:%.*]], i32 5)
+; RV32-NEXT: [[TMP7:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP6]], [[V1]], i32 6)
+; RV32-NEXT: [[TMP8:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP7]], [[V3]], i32 7)
+; RV32-NEXT: call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.i32(target("riscv.vector.tuple", , 8) [[TMP8]], ptr [[PTR:%.*]], i32 -1, i32 3)
+; RV32-NEXT: ret void
+;
+; RV64-LABEL: @store_factor8_vscale(
+; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) poison, [[V0:%.*]], i32 0)
+; RV64-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP1]], [[V2:%.*]], i32 1)
+; RV64-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP2]], [[V0]], i32 2)
+; RV64-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP3]], [[V2]], i32 3)
+; RV64-NEXT: [[TMP5:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP4]], [[V1:%.*]], i32 4)
+; RV64-NEXT: [[TMP6:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP5]], [[V3:%.*]], i32 5)
+; RV64-NEXT: [[TMP7:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP6]], [[V1]], i32 6)
+; RV64-NEXT: [[TMP8:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP7]], [[V3]], i32 7)
+; RV64-NEXT: call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.i64(target("riscv.vector.tuple", , 8) [[TMP8]], ptr [[PTR:%.*]], i64 -1, i64 3)
+; RV64-NEXT: ret void
+;
+  %i0 = call @llvm.vector.interleave2.nxv8i8( %v0, %v1)
+  %i1 = call @llvm.vector.interleave2.nxv8i8( %v0, %v1)
+  %i2 = call @llvm.vector.interleave2.nxv16i8( %i0, %i1)
+
+  %i3 = call @llvm.vector.interleave2.nxv8i8( %v2, %v3)
+  %i4 = call @llvm.vector.interleave2.nxv8i8( %v2, %v3)
+  %i5 = call @llvm.vector.interleave2.nxv16i8( %i3, %i4)
+
+  %i6 = call @llvm.vector.interleave2.nxv32i8( %i2, %i5)
+  store %i6, ptr %ptr, align 4
+  ret void
+}
+
define void @load_factor2_fp128(ptr %ptr) {
; RV32-LABEL: @load_factor2_fp128(
; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load <4 x fp128>, ptr [[PTR:%.*]], align 16