[IA] Generalize the support for power-of-two (de)interleave intrinsics (#123863)

Previously, AArch64 used pattern matching to support
llvm.vector.(de)interleave with factors of 2 and 4, while RISC-V only
supported a factor of 2.

This patch consolidates the logic of the two targets by factoring out
the common (de)interleave-factor computation into the InterleavedAccess pass.
mshockwave authored Jan 23, 2025
1 parent 1077280 commit bc74a1e
Showing 11 changed files with 820 additions and 372 deletions.
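For orientation before the diffs: on the interleave side, the generalized pass treats a chain of llvm.vector.interleave2 calls as a perfectly balanced binary tree, walks it breadth-first to collect the leaf vectors, and takes the leaf count as the interleave factor (the deinterleave side is symmetric, with extractvalue users instead of operands). The toy sketch below is purely illustrative and not part of this commit; the Node struct and collectLeaves helper are stand-ins for the real IR walk. Note that the leaves come out in tree order (A C B D for factor 4), which is why the pass re-interleaves them afterwards; see the second sketch after the InterleavedAccessPass.cpp diff.

#include <cstdio>
#include <deque>
#include <string>
#include <vector>

// Toy model (hypothetical, not LLVM code): an interior node stands for an
// llvm.vector.interleave2 call on its two children, a leaf for one of the
// original narrow vectors.
struct Node {
  std::string Name;                    // set for leaves only
  const Node *L = nullptr, *R = nullptr;
};

// Breadth-first leaf collection, mirroring how getVectorInterleaveFactor
// walks real interleave2 chains; the factor is the number of leaves.
static std::vector<std::string> collectLeaves(const Node &Root) {
  std::vector<std::string> Leaves;
  std::deque<const Node *> Queue{&Root};
  while (!Queue.empty()) {
    const Node *Cur = Queue.front();
    Queue.pop_front();
    for (const Node *Child : {Cur->L, Cur->R}) {
      if (Child->L)                    // interior node: keep walking
        Queue.push_back(Child);
      else                             // leaf: record it in tree order
        Leaves.push_back(Child->Name);
    }
  }
  return Leaves;
}

int main() {
  // interleave2(interleave2(A, C), interleave2(B, D)) == interleave4(A, B, C, D)
  Node A{"A"}, B{"B"}, C{"C"}, D{"D"};
  Node Left{"", &A, &C}, Right{"", &B, &D};
  Node Root{"", &Left, &Right};

  std::vector<std::string> Leaves = collectLeaves(Root);
  std::printf("factor = %zu, tree order:", Leaves.size());
  for (const std::string &Leaf : Leaves)
    std::printf(" %s", Leaf.c_str());
  std::printf("\n");                   // factor = 4, tree order: A C B D
  return 0;
}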
22 changes: 9 additions & 13 deletions llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3156,27 +3156,23 @@ class TargetLoweringBase {
/// Return true on success. Currently only supports
/// llvm.vector.deinterleave2
///
-/// \p DI is the deinterleave intrinsic.
-/// \p LI is the accompanying load instruction
-/// \p DeadInsts is a reference to a vector that keeps track of dead
-/// instruction during transformations.
-virtual bool lowerDeinterleaveIntrinsicToLoad(
-IntrinsicInst *DI, LoadInst *LI,
-SmallVectorImpl<Instruction *> &DeadInsts) const {
+/// \p LI is the accompanying load instruction.
+/// \p DeinterleaveValues contains the deinterleaved values.
+virtual bool
+lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
+ArrayRef<Value *> DeinterleaveValues) const {
return false;
}

/// Lower an interleave intrinsic to a target specific store intrinsic.
/// Return true on success. Currently only supports
/// llvm.vector.interleave2
///
-/// \p II is the interleave intrinsic.
/// \p SI is the accompanying store instruction
-/// \p DeadInsts is a reference to a vector that keeps track of dead
-/// instruction during transformations.
-virtual bool lowerInterleaveIntrinsicToStore(
-IntrinsicInst *II, StoreInst *SI,
-SmallVectorImpl<Instruction *> &DeadInsts) const {
+/// \p InterleaveValues contains the interleaved values.
+virtual bool
+lowerInterleaveIntrinsicToStore(StoreInst *SI,
+ArrayRef<Value *> InterleaveValues) const {
return false;
}

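To illustrate the new contract in TargetLowering.h (a hedged sketch, not code from this commit or from any in-tree target): a backend now receives the memory instruction plus the already-ordered leaf values, and derives the factor from the number of leaves instead of pattern matching the tree itself. MyTargetLowering, its constructor, and the factor cutoff below are assumed placeholders.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Hypothetical backend sketch of the updated hook.
class MyTargetLowering : public TargetLowering {
public:
  explicit MyTargetLowering(const TargetMachine &TM) : TargetLowering(TM) {}

  bool lowerDeinterleaveIntrinsicToLoad(
      LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const override {
    // The pass guarantees a power-of-two number of leaves (>= 2); that count
    // is the deinterleave factor.
    unsigned Factor = DeinterleaveValues.size();
    if (Factor > 8 || !LI->getType()->isVectorTy())
      return false; // e.g. only segmented loads with up to 8 fields exist

    // A real target would emit its segmented/structured load intrinsic here
    // and replace each entry of DeinterleaveValues with the matching result.
    // Returning false keeps the generic IR untouched.
    return false;
  }
};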
177 changes: 169 additions & 8 deletions llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -60,6 +60,7 @@
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
@@ -478,23 +479,179 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
return true;
}

// For an (de)interleave tree like this:
//
// A C B D
// |___| |___|
// |_____|
// |
// A B C D
//
// We will get ABCD at the end while the leaf operands/results
// are ACBD, which are also what we initially collected in
// getVectorInterleaveFactor / getVectorDeinterleaveFactor. But TLI
// hooks (e.g. lowerDeinterleaveIntrinsicToLoad) expect ABCD, so we need
// to reorder them by interleaving these values.
static void interleaveLeafValues(MutableArrayRef<Value *> SubLeaves) {
unsigned NumLeaves = SubLeaves.size();
if (NumLeaves == 2)
return;

assert(isPowerOf2_32(NumLeaves) && NumLeaves > 1);

const unsigned HalfLeaves = NumLeaves / 2;
// Visit the sub-trees.
interleaveLeafValues(SubLeaves.take_front(HalfLeaves));
interleaveLeafValues(SubLeaves.drop_front(HalfLeaves));

SmallVector<Value *, 8> Buffer;
// a0 a1 a2 a3 b0 b1 b2 b3
// -> a0 b0 a1 b1 a2 b2 a3 b3
for (unsigned i = 0U; i < NumLeaves; ++i)
Buffer.push_back(SubLeaves[i / 2 + (i % 2 ? HalfLeaves : 0)]);

llvm::copy(Buffer, SubLeaves.begin());
}

static bool
getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl<Value *> &Operands,
SmallVectorImpl<Instruction *> &DeadInsts) {
assert(II->getIntrinsicID() == Intrinsic::vector_interleave2);

// Visit with BFS
SmallVector<IntrinsicInst *, 8> Queue;
Queue.push_back(II);
while (!Queue.empty()) {
IntrinsicInst *Current = Queue.front();
Queue.erase(Queue.begin());

// All the intermediate intrinsics will be deleted.
DeadInsts.push_back(Current);

for (unsigned I = 0; I < 2; ++I) {
Value *Op = Current->getOperand(I);
if (auto *OpII = dyn_cast<IntrinsicInst>(Op))
if (OpII->getIntrinsicID() == Intrinsic::vector_interleave2) {
Queue.push_back(OpII);
continue;
}

// If this is not a perfectly balanced tree, the leaf
// result types would be different.
if (!Operands.empty() && Op->getType() != Operands.back()->getType())
return false;

Operands.push_back(Op);
}
}

const unsigned Factor = Operands.size();
// Currently we only recognize power-of-two factors.
// FIXME: should we assert here instead?
if (Factor <= 1 || !isPowerOf2_32(Factor))
return false;

interleaveLeafValues(Operands);
return true;
}

static bool
getVectorDeinterleaveFactor(IntrinsicInst *II,
SmallVectorImpl<Value *> &Results,
SmallVectorImpl<Instruction *> &DeadInsts) {
assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2);
using namespace PatternMatch;
if (!II->hasNUses(2))
return false;

// Visit with BFS
SmallVector<IntrinsicInst *, 8> Queue;
Queue.push_back(II);
while (!Queue.empty()) {
IntrinsicInst *Current = Queue.front();
Queue.erase(Queue.begin());
assert(Current->hasNUses(2));

// All the intermediate intrinsics will be deleted from the bottom-up.
DeadInsts.insert(DeadInsts.begin(), Current);

ExtractValueInst *LHS = nullptr, *RHS = nullptr;
for (User *Usr : Current->users()) {
if (!isa<ExtractValueInst>(Usr))
return 0;

auto *EV = cast<ExtractValueInst>(Usr);
// Intermediate ExtractValue instructions will also be deleted.
DeadInsts.insert(DeadInsts.begin(), EV);
ArrayRef<unsigned> Indices = EV->getIndices();
if (Indices.size() != 1)
return false;

if (Indices[0] == 0 && !LHS)
LHS = EV;
else if (Indices[0] == 1 && !RHS)
RHS = EV;
else
return false;
}

// We have legal indices. At this point we're either going
// to continue the traversal or push the leaf values into Results.
for (ExtractValueInst *EV : {LHS, RHS}) {
// Continue the traversal. We're playing safe here and matching only the
// expression consisting of a perfectly balanced binary tree in which all
// intermediate values are only used once.
if (EV->hasOneUse() &&
match(EV->user_back(),
m_Intrinsic<Intrinsic::vector_deinterleave2>()) &&
EV->user_back()->hasNUses(2)) {
auto *EVUsr = cast<IntrinsicInst>(EV->user_back());
Queue.push_back(EVUsr);
continue;
}

// If this is not a perfectly balanced tree, the leaf
// result types would be different.
if (!Results.empty() && EV->getType() != Results.back()->getType())
return false;

// Save the leaf value.
Results.push_back(EV);
}
}

const unsigned Factor = Results.size();
// Currently we only recognize power-of-two factors.
// FIXME: should we assert here instead?
if (Factor <= 1 || !isPowerOf2_32(Factor))
return 0;

interleaveLeafValues(Results);
return true;
}

bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
IntrinsicInst *DI, SmallSetVector<Instruction *, 32> &DeadInsts) {
LoadInst *LI = dyn_cast<LoadInst>(DI->getOperand(0));

if (!LI || !LI->hasOneUse() || !LI->isSimple())
return false;

LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
SmallVector<Value *, 8> DeinterleaveValues;
SmallVector<Instruction *, 8> DeinterleaveDeadInsts;
if (!getVectorDeinterleaveFactor(DI, DeinterleaveValues,
DeinterleaveDeadInsts))
return false;

LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI
<< " with factor = " << DeinterleaveValues.size() << "\n");

// Try and match this with target specific intrinsics.
SmallVector<Instruction *, 4> DeinterleaveDeadInsts;
if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI, DeinterleaveDeadInsts))
if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues))
return false;

DeadInsts.insert(DeinterleaveDeadInsts.begin(), DeinterleaveDeadInsts.end());
// We now have a target-specific load, so delete the old one.
DeadInsts.insert(DI);
DeadInsts.insert(LI);
return true;
}
@@ -509,16 +666,20 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
if (!SI || !SI->isSimple())
return false;

LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
SmallVector<Value *, 8> InterleaveValues;
SmallVector<Instruction *, 8> InterleaveDeadInsts;
if (!getVectorInterleaveFactor(II, InterleaveValues, InterleaveDeadInsts))
return false;

LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II
<< " with factor = " << InterleaveValues.size() << "\n");

SmallVector<Instruction *, 4> InterleaveDeadInsts;
// Try and match this with target specific intrinsics.
if (!TLI->lowerInterleaveIntrinsicToStore(II, SI, InterleaveDeadInsts))
if (!TLI->lowerInterleaveIntrinsicToStore(SI, InterleaveValues))
return false;

// We now have a target-specific store, so delete the old one.
DeadInsts.insert(SI);
DeadInsts.insert(II);
DeadInsts.insert(InterleaveDeadInsts.begin(), InterleaveDeadInsts.end());
return true;
}
[Diffs for the remaining 9 changed files are not shown.]
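As a standalone companion to interleaveLeafValues above, the sketch below mirrors only its index arithmetic on string labels (the interleaveLeaves helper and the main driver are illustrative assumptions; the real function permutes the collected Value leaves in place). It turns the tree-order leaves A C B D from the factor-4 example back into lane order A B C D, the order the TLI hooks expect.

#include <cstdio>
#include <string>
#include <vector>

// Recursively interleave the two halves, exactly the index arithmetic of
// interleaveLeafValues: Buffer[i] = SubLeaves[i / 2 + (i % 2 ? Half : 0)].
static void interleaveLeaves(std::vector<std::string> &Leaves) {
  size_t N = Leaves.size();
  if (N <= 2)
    return;
  std::vector<std::string> Lo(Leaves.begin(), Leaves.begin() + N / 2);
  std::vector<std::string> Hi(Leaves.begin() + N / 2, Leaves.end());
  interleaveLeaves(Lo); // reorder each sub-tree first
  interleaveLeaves(Hi);
  for (size_t I = 0; I < N; ++I)
    Leaves[I] = (I % 2 ? Hi : Lo)[I / 2];
}

int main() {
  // Leaves as collected from the factor-4 tree: A C B D  ->  A B C D.
  std::vector<std::string> Order = {"A", "C", "B", "D"};
  interleaveLeaves(Order);
  for (const std::string &Leaf : Order)
    std::printf("%s ", Leaf.c_str());
  std::printf("\n"); // prints: A B C D
  return 0;
}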
