-
Notifications
You must be signed in to change notification settings - Fork 12.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[IA]: Construct (de)interleave4 out of (de)interleave2 #89276
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-aarch64 Author: Hassnaa Hamdi (hassnaaHamdi) Changes
Patch is 24.17 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/89276.diff 8 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index e0ade02959025f..e84b63f83d91ae 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -59,6 +59,8 @@
#include <string>
#include <utility>
#include <vector>
+#include <stack>
+#include <queue>
namespace llvm {
@@ -3145,6 +3147,7 @@ class TargetLoweringBase {
/// \p DI is the deinterleave intrinsic.
/// \p LI is the accompanying load instruction
virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ SmallVector<Value*>& LeafNodes,
LoadInst *LI) const {
return false;
}
@@ -3156,6 +3159,7 @@ class TargetLoweringBase {
/// \p II is the interleave intrinsic.
/// \p SI is the accompanying store instruction
virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ SmallVector<Value*>& LeafNodes,
StoreInst *SI) const {
return false;
}
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 438ac1c3cc6e2c..35fe04d6ae0469 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -71,6 +71,7 @@
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <utility>
+#include <queue>
using namespace llvm;
@@ -510,12 +511,57 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
+ std::stack<IntrinsicInst*> DeinterleaveTreeQueue;
+ SmallVector<Value*> TempLeafNodes, LeafNodes;
+ std::map<IntrinsicInst*, bool>mp;
+ SmallVector<Instruction *> TempDeadInsts;
+
+ DeinterleaveTreeQueue.push(DI);
+ while(!DeinterleaveTreeQueue.empty()) {
+ auto CurrentDI = DeinterleaveTreeQueue.top();
+ DeinterleaveTreeQueue.pop();
+ TempDeadInsts.push_back(CurrentDI);
+ // iterate over extract users of deinterleave
+ for (auto UserExtract : CurrentDI->users()) {
+ Instruction *Extract = dyn_cast<Instruction>(UserExtract);
+ if (!Extract || Extract->getOpcode() != Instruction::ExtractValue)
+ continue;
+ bool IsLeaf = true;
+ // iterate over deinterleave users of extract
+ for (auto UserDI : UserExtract->users()) {
+ IntrinsicInst *Child_DI = dyn_cast<IntrinsicInst>(UserDI);
+ if (!Child_DI ||
+ Child_DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
+ continue;
+ IsLeaf = false;
+ if (mp.count(Child_DI) == 0) {
+ DeinterleaveTreeQueue.push(Child_DI);
+ }
+ continue;
+ }
+ if (IsLeaf) {
+ TempLeafNodes.push_back(UserExtract);
+ TempDeadInsts.push_back(Extract);
+ }
+ else {
+ TempDeadInsts.push_back(Extract);
+ }
+ }
+ }
+ // sort the deinterleaved nodes in the order that
+ // they will be extracted from the target-specific intrinsic.
+ for (unsigned I = 1; I < TempLeafNodes.size(); I+= 2)
+ LeafNodes.push_back(TempLeafNodes[I]);
+
+ for (unsigned I = 0; I < TempLeafNodes.size(); I+= 2)
+ LeafNodes.push_back(TempLeafNodes[I]);
+
// Try and match this with target specific intrinsics.
- if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI))
+ if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LeafNodes, LI))
return false;
// We now have a target-specific load, so delete the old one.
- DeadInsts.push_back(DI);
+ DeadInsts.insert(DeadInsts.end(), TempDeadInsts.rbegin(), TempDeadInsts.rend());
DeadInsts.push_back(LI);
return true;
}
@@ -531,14 +577,37 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
return false;
LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
-
+ std::queue<IntrinsicInst*> IeinterleaveTreeQueue;
+ SmallVector<Value*> TempLeafNodes, LeafNodes;
+ SmallVector<Instruction *> TempDeadInsts;
+
+ IeinterleaveTreeQueue.push(II);
+ while(!IeinterleaveTreeQueue.empty()) {
+ auto node = IeinterleaveTreeQueue.front();
+ TempDeadInsts.push_back(node);
+ IeinterleaveTreeQueue.pop();
+ for(unsigned i = 0; i < 2; i++) {
+ auto op = node->getOperand(i);
+ if(auto CurrentII = dyn_cast<IntrinsicInst>(op)) {
+ if (CurrentII->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
+ continue;
+ IeinterleaveTreeQueue.push(CurrentII);
+ continue;
+ }
+ TempLeafNodes.push_back(op);
+ }
+ }
+ for (unsigned I = 0; I < TempLeafNodes.size(); I += 2)
+ LeafNodes.push_back(TempLeafNodes[I]);
+ for (unsigned I = 1; I < TempLeafNodes.size(); I += 2)
+ LeafNodes.push_back(TempLeafNodes[I]);
// Try and match this with target specific intrinsics.
- if (!TLI->lowerInterleaveIntrinsicToStore(II, SI))
+ if (!TLI->lowerInterleaveIntrinsicToStore(II, LeafNodes, SI))
return false;
// We now have a target-specific store, so delete the old one.
DeadInsts.push_back(SI);
- DeadInsts.push_back(II);
+ DeadInsts.insert(DeadInsts.end(), TempDeadInsts.begin(), TempDeadInsts.end());
return true;
}
@@ -559,7 +628,7 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
// with a factor of 2.
if (II->getIntrinsicID() == Intrinsic::experimental_vector_deinterleave2)
Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
- if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
+ else if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7947d73f9a4dd0..921335d8b49a0d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16345,15 +16345,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
- IntrinsicInst *DI, LoadInst *LI) const {
+ IntrinsicInst *DI, SmallVector<Value*>& LeafNodes, LoadInst *LI) const {
// Only deinterleave2 supported at present.
if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
return false;
- // Only a factor of 2 supported at present.
- const unsigned Factor = 2;
+ const unsigned Factor = std::max(2, (int)LeafNodes.size());
- VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
+ VectorType *VTy = (LeafNodes.size() > 0) ? cast<VectorType>(LeafNodes.front()->getType()) :
+ cast<VectorType>(DI->getType()->getContainedType(0));
const DataLayout &DL = DI->getModule()->getDataLayout();
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
@@ -16409,8 +16409,19 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Result = Builder.CreateInsertValue(Result, Left, 0);
Result = Builder.CreateInsertValue(Result, Right, 1);
} else {
- if (UseScalable)
+ if (UseScalable) {
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
+ if (Factor == 2) {
+ DI->replaceAllUsesWith(Result);
+ return true;
+ }
+ for(unsigned I = 0; I < LeafNodes.size(); I ++) {
+ llvm::Value* CurrentExtract = LeafNodes[I];
+ Value *Newextrct = Builder.CreateExtractValue(Result, I);
+ CurrentExtract->replaceAllUsesWith(Newextrct);
+ }
+ return true;
+ }
else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
}
@@ -16420,15 +16431,15 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
}
bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
- IntrinsicInst *II, StoreInst *SI) const {
+ IntrinsicInst *II, SmallVector<Value*>& LeafNodes, StoreInst *SI) const {
// Only interleave2 supported at present.
if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
return false;
- // Only a factor of 2 supported at present.
- const unsigned Factor = 2;
+ // leaf nodes are the nodes that will be interleaved
+ const unsigned Factor = LeafNodes.size();
- VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
+ VectorType *VTy = cast<VectorType>(LeafNodes.front()->getType());
const DataLayout &DL = II->getModule()->getDataLayout();
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
@@ -16473,8 +16484,12 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
}
- if (UseScalable)
- Builder.CreateCall(StNFunc, {L, R, Pred, Address});
+ if (UseScalable) {
+ SmallVector<Value *> Args(LeafNodes);
+ Args.push_back(Pred);
+ Args.push_back(Address);
+ Builder.CreateCall(StNFunc, Args);
+ }
else
Builder.CreateCall(StNFunc, {L, R, Address});
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index db6e8a00d2fb5e..1f91113221636e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -683,9 +683,11 @@ class AArch64TargetLowering : public TargetLowering {
unsigned Factor) const override;
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ SmallVector<Value*>& LeafNodes,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ SmallVector<Value*>& LeafNodes,
StoreInst *SI) const override;
bool isLegalAddImmediate(int64_t) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index dc7c6f83b98579..5218ef47c56065 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21025,6 +21025,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ SmallVector<Value*>& LeafNodes,
LoadInst *LI) const {
assert(LI->isSimple());
IRBuilder<> Builder(LI);
@@ -21033,10 +21034,11 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
return false;
- unsigned Factor = 2;
+ unsigned Factor = std::max(2, (int)LeafNodes.size());
VectorType *VTy = cast<VectorType>(DI->getOperand(0)->getType());
- VectorType *ResVTy = cast<VectorType>(DI->getType()->getContainedType(0));
+ VectorType *ResVTy = (LeafNodes.size() > 0) ? cast<VectorType>(LeafNodes.front()->getType()) :
+ cast<VectorType>(DI->getType()->getContainedType(0));
if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(),
LI->getPointerAddressSpace(),
@@ -21064,6 +21066,19 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
{ResVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
Ops.append(Factor, PoisonValue::get(ResVTy));
+ Ops.append({LI->getPointerOperand(), VL});
+ Value *Vlseg = Builder.CreateCall(VlsegNFunc, Ops);
+ //-----------
+ if (Factor == 2) {
+ DI->replaceAllUsesWith(Vlseg);
+ return true;
+ }
+ for (unsigned I = 0; I < LeafNodes.size(); I ++) {
+ auto CurrentExtract = LeafNodes[I];
+ Value *NewExtract = Builder.CreateExtractValue(Vlseg, I);
+ CurrentExtract->replaceAllUsesWith(NewExtract);
+ }
+ return true;
}
Ops.append({LI->getPointerOperand(), VL});
@@ -21075,6 +21090,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
}
bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ SmallVector<Value*>& LeafNodes,
StoreInst *SI) const {
assert(SI->isSimple());
IRBuilder<> Builder(SI);
@@ -21083,10 +21099,10 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
return false;
- unsigned Factor = 2;
+ unsigned Factor = LeafNodes.size();
VectorType *VTy = cast<VectorType>(II->getType());
- VectorType *InVTy = cast<VectorType>(II->getOperand(0)->getType());
+ VectorType *InVTy = cast<VectorType>(LeafNodes.front()->getType());
if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(),
SI->getPointerAddressSpace(),
@@ -21112,6 +21128,11 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
VssegNFunc = Intrinsic::getDeclaration(SI->getModule(), IntrIds[Factor - 2],
{InVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
+ SmallVector<Value *> Args(LeafNodes);
+ Args.push_back(SI->getPointerOperand());
+ Args.push_back(VL);
+ Builder.CreateCall(VssegNFunc, Args);
+ return true;
}
Builder.CreateCall(VssegNFunc, {II->getOperand(0), II->getOperand(1),
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index b10da3d40befb7..24ed1ce7dbccce 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -855,10 +855,12 @@ class RISCVTargetLowering : public TargetLowering {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *II,
+ bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ SmallVector<Value*>& LeafNodes,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ SmallVector<Value*>& LeafNodes,
StoreInst *SI) const override;
bool supportKCFIBundles() const override { return true; }
diff --git a/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll b/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
new file mode 100644
index 00000000000000..606bb93e309e12
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+%struct.xyzt = type { i32, i32, i32, i32 }
+
+define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
+; CHECK-LABEL: loop_xyzt:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: cntw x10
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w9, #1024 // =0x400
+; CHECK-NEXT: neg x10, x10
+; CHECK-NEXT: rdvl x11, #4
+; CHECK-NEXT: .LBB0_1: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x12, x1, x8
+; CHECK-NEXT: adds x9, x9, x10
+; CHECK-NEXT: ld4w { z0.s - z3.s }, p0/z, [x12]
+; CHECK-NEXT: add x12, x2, x8
+; CHECK-NEXT: ld4w { z4.s - z7.s }, p0/z, [x12]
+; CHECK-NEXT: add x12, x0, x8
+; CHECK-NEXT: add x8, x8, x11
+; CHECK-NEXT: add z16.s, z4.s, z0.s
+; CHECK-NEXT: sub z17.s, z1.s, z5.s
+; CHECK-NEXT: movprfx z18, z2
+; CHECK-NEXT: lsl z18.s, p0/m, z18.s, z6.s
+; CHECK-NEXT: movprfx z19, z3
+; CHECK-NEXT: asr z19.s, p0/m, z19.s, z7.s
+; CHECK-NEXT: st4w { z16.s - z19.s }, p0, [x12]
+; CHECK-NEXT: b.ne .LBB0_1
+; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call i64 @llvm.vscale.i64()
+ %1 = shl nuw nsw i64 %0, 2
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %2 = getelementptr inbounds %struct.xyzt, ptr %a, i64 %index
+ %wide.vec = load <vscale x 16 x i32>, ptr %2, align 4
+ %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
+ %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
+ %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
+ %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
+ %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+ %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
+ %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
+ %9 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %index
+ %wide.vec57 = load <vscale x 16 x i32>, ptr %9, align 4
+ %root.strided.vec58 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec57)
+ %10 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 0
+ %11 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 1
+ %root.strided.vec59 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %10)
+ %12 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 0
+ %13 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 1
+ %root.strided.vec60 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %11)
+ %14 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 0
+ %15 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 1
+ %16 = add nsw <vscale x 4 x i32> %12, %5
+ %17 = sub nsw <vscale x 4 x i32> %7, %14
+ %18 = shl <vscale x 4 x i32> %6, %13
+ %19 = ashr <vscale x 4 x i32> %8, %15
+ %20 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %index
+ %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %16, <vscale x 4 x i32> %18)
+ %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %17, <vscale x 4 x i32> %19)
+ %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
+ store <vscale x 16 x i32> %interleaved.vec62, ptr %20, align 4
+ %index.next = add nuw i64 %index, %1
+ %21 = icmp eq i64 %index.next, 1024
+ br i1 %21, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
new file mode 100644
index 00000000000000..2ea14b13265c61
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh | FileCheck %s
+
+%struct.xyzt = type { i32, i32, i32, i32 }
+
+define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
+; CHECK-LABEL: loop_xyzt:
+; CHE...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
22c3ca5
to
7c19cad
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've not looked at the ISelLowering changes yet because I want to get an understanding of what we need to match first because that might dictate where the code ends up.
The patch could also do with some SVE VLS tests (see fixed-deinterleave-intrinsics.ll). I'm not asking for this work to support such types, only that tests exist to ensure they don't trigger any failure paths. This mirrors the same request I had for https://reviews.llvm.org/D146218.
Preferably, the tests would be the first commit on this PR, with the transformation in following commits, so that it's easy to see the effect of the new code. I guess you could precommit the tests, but personally I'm fine with them remaining within this PR.
while (!IeinterleaveTreeQueue.empty()) { | ||
auto node = IeinterleaveTreeQueue.front(); | ||
TempDeadInsts.push_back(node); | ||
IeinterleaveTreeQueue.pop(); | ||
for (unsigned i = 0; i < 2; i++) { | ||
auto op = node->getOperand(i); | ||
if (auto CurrentII = dyn_cast<IntrinsicInst>(op)) { | ||
if (CurrentII->getIntrinsicID() != | ||
Intrinsic::experimental_vector_interleave2) | ||
continue; | ||
IeinterleaveTreeQueue.push(CurrentII); | ||
continue; | ||
} | ||
TempLeafNodes.push_back(op); | ||
} | ||
} | ||
for (unsigned I = 0; I < TempLeafNodes.size(); I += 2) | ||
LeafNodes.push_back(TempLeafNodes[I]); | ||
for (unsigned I = 1; I < TempLeafNodes.size(); I += 2) | ||
LeafNodes.push_back(TempLeafNodes[I]); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks rather more complex than I was expecting. I was hoping for something akin to:
match(II, interleave(interleave(Value(A), Value(C)), interleave(Value(B), Value(D))))
Have I got my logic wrong and thus the extra complexity is necessary?
define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { | ||
; CHECK-LABEL: loop_xyzt: | ||
; CHECK: // %bb.0: // %entry | ||
; CHECK-NEXT: ptrue p0.s | ||
; CHECK-NEXT: cntw x10 | ||
; CHECK-NEXT: mov x8, xzr | ||
; CHECK-NEXT: mov w9, #1024 // =0x400 | ||
; CHECK-NEXT: neg x10, x10 | ||
; CHECK-NEXT: rdvl x11, #4 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please can you simplify the tests to the minimum required to test the new code. In this instance there should be no need for any loop, just straight line code for a single load with interleave.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And drop unnecessary attributes like dso_local
while (!DeinterleaveTreeQueue.empty()) { | ||
auto CurrentDI = DeinterleaveTreeQueue.top(); | ||
DeinterleaveTreeQueue.pop(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My comment about the interleave parsing also applies here in that I was anticipating a simple pattern match.
inline Deinterleave2_match<LHS, RHS> m_Deinterleave2(const LHS &L, const RHS &R) { | ||
return Deinterleave2_match<LHS, RHS>(L, R); | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi @paulwalker-arm
I have added pattern matchers as you advised. Is this what you mean?
I have tested that implementation locally, but I don't want to update the other files until we agree on the approach.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not quite. PatternMatch should not care about how the IR is used, that's up to the user of the interface. It exists purely to provide a convenient way to express the tree of IR to compare against. Interleave2_match
does this but Deinterleave2_match
is trying to encode logic for your specific use case, which belongs with the transformation itself (NOTE: m_ExtractValue
exists so presumably you'll want to use that when constructing your match
call).
FYI: m_Interleave2
can be simplified, see m_VecReverse
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have updated it
c55d6a0
to
443eda1
Compare
@@ -56,6 +56,8 @@ | |||
#include <cstdint> | |||
#include <iterator> | |||
#include <map> | |||
#include <queue> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we need this headers in the .h file?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this question still needs to be addressed. There are no other changes in this file. Is this needed? If it is needed, should it be in a cpp file?
@@ -3145,6 +3147,7 @@ class TargetLoweringBase { | |||
/// \p DI is the deinterleave intrinsic. | |||
/// \p LI is the accompanying load instruction | |||
virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI, | |||
SmallVector<Value *> &LeafNodes, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use SmallVectorImpl
@@ -3156,6 +3159,7 @@ class TargetLoweringBase { | |||
/// \p II is the interleave intrinsic. | |||
/// \p SI is the accompanying store instruction | |||
virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II, | |||
SmallVector<Value *> &LeafNodes, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use SmallVectorImpl
llvm/include/llvm/IR/PatternMatch.h
Outdated
|
||
template <typename ITy> bool match(ITy *V) { | ||
auto *I = dyn_cast<IntrinsicInst>(V); | ||
if (m_Intrinsic<Intrinsic::experimental_vector_interleave2>().match(V)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
drop curly braces around single line body
llvm/include/llvm/IR/PatternMatch.h
Outdated
} | ||
|
||
// Match deinterleave tree. | ||
// if the current user of deinterelave is the last (extract instr) in the tree, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
deinterelave -> deinterleave
@@ -21064,6 +21067,19 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI, | |||
{ResVTy, XLenTy}); | |||
VL = Constant::getAllOnesValue(XLenTy); | |||
Ops.append(Factor, PoisonValue::get(ResVTy)); | |||
Ops.append({LI->getPointerOperand(), VL}); | |||
Value *Vlseg = Builder.CreateCall(VlsegNFunc, Ops); | |||
//----------- |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Drop this comment
SmallVector<Value *> Args(LeafNodes); | ||
Args.push_back(SI->getPointerOperand()); | ||
Args.push_back(VL); | ||
Builder.CreateCall(VssegNFunc, Args); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
use a braced list instead of SmallVector
|
||
IeinterleaveTreeQueue.push(II); | ||
while (!IeinterleaveTreeQueue.empty()) { | ||
auto node = IeinterleaveTreeQueue.front(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Capitalize variable names.
And use auto *
instead of just auto
. Don't hide pointers in auto.
} | ||
for (unsigned I = 0; I < LeafNodes.size(); I++) { | ||
llvm::Value *CurrentExtract = LeafNodes[I]; | ||
Value *Newextrct = Builder.CreateExtractValue(Result, I); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Newextrct -> NewExtract.
define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { | ||
; CHECK-LABEL: loop_xyzt: | ||
; CHECK: // %bb.0: // %entry | ||
; CHECK-NEXT: ptrue p0.s | ||
; CHECK-NEXT: cntw x10 | ||
; CHECK-NEXT: mov x8, xzr | ||
; CHECK-NEXT: mov w9, #1024 // =0x400 | ||
; CHECK-NEXT: neg x10, x10 | ||
; CHECK-NEXT: rdvl x11, #4 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And drop unnecessary attributes like dso_local
443eda1
to
782f8d0
Compare
782f8d0
to
bc0344e
Compare
; CHECK-NEXT: movprfx z5, z27 | ||
; CHECK-NEXT: asr z5.s, p0/m, z5.s, z1.s | ||
; CHECK-NEXT: st4w { z28.s - z31.s }, p0, [x0] | ||
; CHECK-NEXT: st4w { z2.s - z5.s }, p0, [x0, #4, mul vl] | ||
; CHECK-NEXT: ret | ||
%wide.vec = load <vscale x 32 x i32>, ptr %a, align 4 | ||
%root.strided.vec = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %wide.vec) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@paulwalker-arm I have added the test file as a pre-commit.
@topperc Sorry Craig, I had to remove the code related to RISCV target because the solution was changed. |
@@ -56,6 +56,8 @@ | |||
#include <cstdint> | |||
#include <iterator> | |||
#include <map> | |||
#include <queue> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this question still needs to be addressed. There are no other changes in this file. Is this needed? If it is needed, should it be in a cpp file?
llvm::Value *CurrentExtract = ValuesToDeinterleave[I]; | ||
Value *NewExtract = Builder.CreateExtractValue(Result, I); | ||
CurrentExtract->replaceAllUsesWith(NewExtract); | ||
dyn_cast<Instruction>(CurrentExtract)->eraseFromParent(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use cast
since you expect it to return a non-null value.
dec7c6b
to
cf5a27c
Compare
f26b52a
to
b2288dd
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry for the delay. I've only looked at the interleaving side of things so far but I suspect GetDeinterleaveLeaves
might have the same issues as GetInterleaveLeaves
assuming I've not misunderstood the code.
@@ -16441,17 +16441,56 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, | |||
return true; | |||
} | |||
|
|||
bool GetDeinterleaveLeaves(Value *DI, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The coding standard requires functions to start with lowercase.
@@ -16493,39 +16529,71 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( | |||
LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN"); | |||
else | |||
LdN = Builder.CreateCall(LdNFunc, Address, "ldN"); | |||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this not good whitespace to separate the if block from the code that follows?
if (!match(II, m_Interleave2(m_Value(Op0), m_Value(Op1)))) | ||
return false; | ||
|
||
if (!GetInterleaveLeaves(Op0, InterleaveOps)) { | ||
InterleaveOps.push_back(Op0); | ||
} | ||
|
||
if (!GetInterleaveLeaves(Op1, InterleaveOps)) { | ||
InterleaveOps.push_back(Op1); | ||
} | ||
return true; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks too general to me given we require one of two specific patterns. I see a couple of problems:
-
The return operands don't look to be ordered correctly for how
st4
works?
interleave2(interleave2(Value(A), Value(B)), interleave2(Value(C), Value(D)))
with return InterleaveOps = {A, B, C, D},
but from the previous conversation I believe interleave4(A, B, C, D)
is the equivalent of interleave2(interleave2(Value(A), Value(C)), interleave2(Value(B), Value(D)))
? -
We'll miss places where
st2
orst4
can be used based purely because their operands are the result of a call to interleave2. For example,interleave2(interleave2(Value(D), Value(C)), interleave2(Value(B), Value(A)))
can still usest2
it's just the two childinterleave2
calls will remain.
Perhaps we can just look for the specific st4
pattern and if that fails we then look for the st2
pattern (which is a given because we already know II
is a call to interleave2
.
SmallVector<Value *, 4> ValuesToInterleave; | ||
GetInterleaveLeaves(II, ValuesToInterleave); | ||
unsigned Factor = ValuesToInterleave.size(); | ||
assert(Factor >= 2 && "Expected Interleave Factor >= 2"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is an assert sufficient? What happens when Factor
is 3 or 5?
|
||
%struct.xyzt = type { i32, i32, i32, i32 } | ||
|
||
define void @interleave(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, <vscale x 4 x i32> %x) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Given the changes are at the IR level I think it best for the tests to use opt
rather than llc
. Also, it is better to test the two idioms (i.e. interleave4 and deinterleave4) in isolation. If you look at llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
you'll see what I mean on both counts.
b2288dd
to
a086c0d
Compare
auto *A = *(++DI1->user_begin()); | ||
auto *C = *(DI1->user_begin()); | ||
auto *B = *(++DI2->user_begin()); | ||
auto *D = *(DI2->user_begin()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this making an assumption about the order the results will be extracted? i.e. it looks like you're assuming A will be m_ExtractValue<0>
and B is m_ExtractValue<0>
. You also do this above but at least there the bad result would bogusly bail out of the transformation whereas here the emitted code will be wrong.
llvm::Value *CurrentExtract = DeinterleavedValues[I]; | ||
Value *NewExtract = Builder.CreateExtractValue(Result, I); | ||
CurrentExtract->replaceAllUsesWith(NewExtract); | ||
cast<Instruction>(CurrentExtract)->eraseFromParent(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It 's possible for DeinterleavedValues
to hold Instruction*
rather than Value*
then you'll not need such casts?
With that said, given the IR chain for deinterleaving is non-trivial perhaps using RecursivelyDeleteTriviallyDeadInstructions
at the end of the function would simplify things because you'll not need to track DeadInsts other than the top-level extracts you're replacing here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
With that said, given the IR chain for deinterleaving is non-trivial perhaps using RecursivelyDeleteTriviallyDeadInstructions at the end of the function would simplify things because you'll not need to track DeadInsts other than the top-level extracts you're replacing here.
I resolved that comment but without recursive deletion. I think it's simpler without the recursive deletion.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This time I've limited myself to the deinterleave side of things. The code structure now looks very good with my comments aiming to simplify a few things whilst bringing the existing Factor==2 code in line with your Factor==4 implementation that's much more resilient.
@@ -16585,17 +16585,87 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, | |||
return true; | |||
} | |||
|
|||
bool getDeinterleavedValues(Value *DI, | |||
SmallVectorImpl<Instruction *> &DeinterleavedValues) { | |||
if (!DI->hasNUsesOrMore(2)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the "OrMore" part important or can this just be hasNUses(2)
? I'm thinking if there are other users then we'll not be able to delete the instructions and thus might not gain anything from performing the transformation.
To further this point, the code that follows assumes the first uses will be the extracts you're looking for, which is perfectly fine and expected, but does suggest you only need to care about an exact use count.
auto *DI1 = *(Extr1->user_begin()); | ||
auto *DI2 = *(Extr2->user_begin()); | ||
|
||
if (!DI1->hasNUsesOrMore(2) || !DI2->hasNUsesOrMore(2)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As above.
if (!Extr1 || !Extr2) | ||
return false; | ||
|
||
if (!Extr1->hasNUsesOrMore(1) || !Extr2->hasNUsesOrMore(1)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As above.
@@ -16585,17 +16585,87 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, | |||
return true; | |||
} | |||
|
|||
bool getDeinterleavedValues(Value *DI, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There's a level of complexity to this function, so I think an ASCII-art diagram of the IR it matches would help.
DeinterleavedValues[A->getIndices()[0] + (Extr1->getIndices()[0] * 2)] = A; | ||
DeinterleavedValues[B->getIndices()[0] + (Extr1->getIndices()[0] * 2)] = B; | ||
DeinterleavedValues[C->getIndices()[0] + (Extr2->getIndices()[0] * 2)] = C; | ||
DeinterleavedValues[D->getIndices()[0] + (Extr2->getIndices()[0] * 2)] = D; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like the *2
is being applied to the wrong extract, which explains why you had to swap the middle values below.
It depends on how paranoid we need to be because I suppose there's the chance that an index might not be 0 or 1, so worth adding & 0x3
to limit the range?
You'll also need to have another round of null pointer checks for DeinterleavedValues[0-3]
just in case the extracts are not evenly distributed.
|
||
// Make sure that A,B,C,D match the deinterleave tree pattern | ||
if (!match(DeinterleavedValues[0], m_ExtractValue<0>(m_Deinterleave2( | ||
m_ExtractValue<0>(m_Deinterleave2(m_Value()))))) || |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Given DI
is passed in and we can assume it's the top level deinterleave2 intrinsic I think for all these match lines you can use the scheme:
m_ExtractValue<#>(m_Deinterleave2(m_ExtractValue<#>(m_Specific(DI))))
!match(DeinterleavedValues[1], m_ExtractValue<1>(m_Deinterleave2( | ||
m_ExtractValue<0>(m_Deinterleave2(m_Value()))))) || | ||
!match(DeinterleavedValues[2], m_ExtractValue<0>(m_Deinterleave2( | ||
m_ExtractValue<1>(m_Deinterleave2(m_Value()))))) || |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To match my earlier comment I think the m_ExtractValue
indices need to be swapped and then the std::swap
below can be removed.
void deleteDeadDeinterleaveInstructions(Instruction *DeadRoot) { | ||
Value *DeadDeinterleave = nullptr, *DeadExtract = nullptr; | ||
match(DeadRoot, m_ExtractValue(m_Value(DeadDeinterleave))); | ||
assert(DeadDeinterleave != nullptr && "Match is expected to succeed"); | ||
match(DeadDeinterleave, m_Deinterleave2(m_Value(DeadExtract))); | ||
assert(DeadExtract != nullptr && "Match is expected to succeed"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry for the misdirect but with the new simpler code structure and assuming you agree with my "strict NUses" comment then you're now in a position to know exactly which instructions are dead and can thus maintain a dead instruction list as you did in the previous version of this PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But in the last PR, I also was in a position to know exactly which instructions are dead, so what is the difference?
This is the old code:
// These Values will not be used anymre,
// DI4 will be created instead of nested DI1 and DI2
DeadInsts.push_back(cast<Instruction>(DI1));
DeadInsts.push_back(cast<Instruction>(Extr0));
DeadInsts.push_back(cast<Instruction>(DI2));
DeadInsts.push_back(cast<Instruction>(Extr1));
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Quite possibly just my misunderstanding of the code at that time. I do think the use count checking wasn't strict though, much like with the current version, so perhaps that made me think there was more complexity around whether the instructions were actually dead.
if (getDeinterleavedValues(DI, DeinterleavedValues)) { | ||
Factor = DeinterleavedValues.size(); | ||
VTy = cast<VectorType>(DeinterleavedValues[0]->getType()); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would be nice to unify the code a little more, especially because after inspection I think the current upstream Factor==2 code is not good, especially when compared to your new Factor==4 code. If you don't mind improving that as part of this PR then I think it is worth implementing something like:
getDeinterleavedValues(DI, DeinterleavedValues....) {
if (getDeinterleaved4Values(Di, DeinterleavedValues...))
return true;
return getDeinterleaved2Values(Di, DeinterleavedValues...));
}.
and then here you'd do as you've done for the interleave handling, i.e.
if (!getDeinterleavedValues(DI, DeinterleavedValues....))
return false;
I'm assuming getDeinterleaved2Values
will be a fairly trivial function
if (Factor > 2) { | ||
// Itereate over old deinterleaved values to replace it by | ||
// the new deinterleaved values. | ||
for (unsigned I = 0; I < DeinterleavedValues.size(); I++) { | ||
Value *NewExtract = Builder.CreateExtractValue(Result, I); | ||
DeinterleavedValues[I]->replaceAllUsesWith(NewExtract); | ||
} | ||
for (unsigned I = 0; I < DeinterleavedValues.size(); I++) | ||
deleteDeadDeinterleaveInstructions(DeinterleavedValues[I]); | ||
return true; | ||
} | ||
DI->replaceAllUsesWith(Result); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
With my getDeinterleavedValues
suggestion above I think this can be unified as well and would fix the existing issue whereby the Factor==2 code is assuming the order of the extracts.
e5154bd
to
d4ac0b5
Compare
Change-Id: I5e2613156a482dcadae3e4cfa1bacdf7f3293fe2
- InterleavedAccess pass is updated to spot load/store (de)interleave4-like sequences, and emit equivalent sve.ld4 or sve.st4 intrinsics for targets that support SVE. - Tests are added for targets that support SVE. Change-Id: I76ef31080ddd72b182c1a3b1752a6178dc78ea84
Change-Id: Id94189e601ed70c5ea922f9adbee63cf8b80829a
Change-Id: Id7639dcb125a2f642b2fea78ea884b74be1c6b74
Change-Id: I2b2dc683dba21cdb6c35f407868a7537245c845e
…e of the order using pattern match Change-Id: I053e47d156c37cf4d7ab5b2af83c348b4210631a
add negative tests Change-Id: Id323139f11ddc4d3a22a72af84a01e98dadfe46d
d4ac0b5
to
85d159b
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Mostly superficial comments other than a restructuring of where dead instructions are erased.
if (Factor == 2) | ||
Result = PoisonValue::get(StructType::get(VTy, VTy)); | ||
else | ||
Result = PoisonValue::get(StructType::get(VTy, VTy, VTy, VTy)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would the following work?
SmallVector<Type *, 4> ResultFieldTys(Factor, VTy);
Result = PoisonValue::get(StructType::create(ResultFieldTys));
Value *Right = PoisonValue::get(VTy); | ||
|
||
// Create multiple legal small ldN instead of a wide one. | ||
SmallVector<Value *, 4> WideValues(Factor, (PoisonValue::get(VTy))); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Redundant brackets wrapping PoisonValue::get(VTy)
?
Result = PoisonValue::get(StructType::get(VTy, VTy)); | ||
else | ||
Result = PoisonValue::get(StructType::get(VTy, VTy, VTy, VTy)); | ||
// Construct the wide result out of the small results. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The comment doesn't look correct to me and in general offers little value. The following code isn't creating a wide result; it's just creating the IR necessary to return multiple results, hence populating a struct.
With that said I think you can remove this loop anyway. (See next comment relating to CreateExtractValue).
// Itereate over old deinterleaved values to replace it by | ||
// the new values. | ||
for (unsigned I = 0; I < DeinterleavedValues.size(); I++) { | ||
Value *NewExtract = Builder.CreateExtractValue(Result, I); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Here you're extracting a field from a struct that you'd only just added so I'm thinking you can use WideValues
directly?
I'm assuming the code is like it is because of the NumLoads==1
case so I'm wondering if you can move the extract values into the else block, effectively ensuring WideValues
(or perhaps a better name is now required) is set on both sides of the NumLoads
comparison and thus here you have access to the extracted values directly (i.e. for the Factor==4 case
there would be no need to create the intermediary struct, and my two previous comments become irrelevant).
values in correct order of interleave4: A B C D. | ||
If there is a pattern matches the interleave tree above, then we can construct | ||
Interleave4 out of that pattern. This function tries to match the interleave | ||
tree pattern, and fetch the values that we want to interleave, so that in | ||
further steps they can be replaced by the output of Inteleave4. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Similar to before, perhaps:
Returns true if `II` is the root of an IR tree that represents a theoretical vector.interleave4 intrinsic. When true is returned `ValuesToInterleave` is populated with the inputs such an intrinsic would take (i.e. vector.interleave4(A, B, C, D)).
roots in correct order of DI4: A B C D. | ||
If there is a pattern matches the deinterleave tree above, then we can construct | ||
DI4 out of that pattern. This function tries to match the deinterleave tree | ||
pattern, and fetch the tree roots, so that in further steps they can be replaced | ||
by the output of DI4. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Up to you but perhaps:
Returns true if `DI` is the top of an IR tree that represents a theoretical vector.deinterleave4 intrinsic. When true is returned, `DeinterleavedValues` is populated with the results such an intrinsic would return (i.e. {A, B, C, D } = vector.deinterleave4(...))
if (!Extr1 || !Extr2) | ||
return false; | ||
|
||
if (!Extr1->hasNUses(1) || !Extr2->hasNUses(1)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please use !###->hasOneUse()
here because it's more efficient than !hasNUses(1)
.
// Make sure that the A,B,C,D are instructions of ExtractValue, | ||
// before getting the extract index |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Make sure A,B,C and D are ExtractValue instructions before....
Builder.CreateCall(StNFunc, {L, R, Pred, Address}); | ||
else | ||
Builder.CreateCall(StNFunc, {L, R, Address}); | ||
Builder.CreateCall(StNFunc, ValuesToInterleave); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
After this point it looks like there will be some dead instructions that remain after the transformation because the caller of lowerInterleaveIntrinsicToStore
will only erase the store and first call to vector.interleave2.
You'll have to maintain a dead instruction list like you have for the deinterleaving case but because instructions are erased "uses first" I think it best for the SmallVector to be passed into lowerInterleaveIntrinsicToStore
which it then populates, and the caller should then iterate across it calling erase.
If you agree with this change then I think it makes sense to do the same for lowerDeinterleaveIntrinsicToLoad
so that all instruction deletion is the responsibility of the caller. The effect on other targets will be minimal because as it stands they don't have any dead instruction and so will just ignore the new parameter.
; NEON-NEXT: [[LOAD:%.*]] = load <16 x i16>, ptr [[PTR]], align 2 | ||
; NEON-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <8 x i16>, <8 x i16> } @llvm.vector.deinterleave2.v16i16(<16 x i16> [[LOAD]]) | ||
; NEON-NEXT: ret { <8 x i16>, <8 x i16> } [[DEINTERLEAVE]] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This suggests you'll need to update more of the tests like you have above by adding instances of extractvalue
.
At some point we might want to restore the original behaviour but I don't think the current way these tests are written matches the intent of the original work and I'm happy to lose this bit of flexibility given the bugs this PR fixes alongside the new factor4 support.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sorry, I don't understand the second part of the comment.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The original tests show the deinterleave(load())->ld2
transformation can be done when the results of the deinterleave are not explicitly extracted (via extractvalue). With your change the transformation is only possible when the result of the deinterleave are explicitly extracted.
Personally I don't see this as a problem at this stage because that's not the expected use case and so I'm happy for the tests to be updated to show the transformation as you did for deinterleave_i8_factor2
.
- remove dead instructions of interleave tree. - code improvement. Change-Id: I2020bf850c987829bf6e2255425eacee9361a980
Change-Id: I5d1cada783bb3da0a5d51fd10c98428d63db9c13
@@ -3167,7 +3168,8 @@ class TargetLoweringBase { | |||
/// \p II is the interleave intrinsic. | |||
/// \p SI is the accompanying store instruction | |||
virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II, | |||
StoreInst *SI) const { | |||
StoreInst *SI, | |||
SmallVectorImpl<Instruction *> &DeadInstructions) const { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please use consistent names for both functions (i.e. either DeadInsts
or DeadInstructions
but not both).
@@ -17042,13 +17041,17 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( | |||
VectorType *VTy = cast<VectorType>(DeinterleavedValues[0]->getType()); | |||
|
|||
bool UseScalable; | |||
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable)) | |||
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable)) { | |||
DeadInsts.clear(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This assumes DeadInsts
will be empty when entering the function, which you shouldn't rely on in general. You likely need a local until you're ready to commit to the transformation and then update DeadInsts
accordingly.
|
||
// TODO: Add support for using SVE instructions with fixed types later, using | ||
// the code from lowerInterleavedLoad to obtain the correct container type. | ||
if (UseScalable && !VTy->isScalableTy()) | ||
if (UseScalable && !VTy->isScalableTy()) { | ||
DeadInsts.clear(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As above, you cannot assume DeadInsts
is empty on function entry.
if (UseScalable) | ||
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN"); | ||
else | ||
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN"); | ||
// Replcae output of deinterleave2 intrinsic by output of ldN2/ldN4 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
s/Replcae/Replace/
@@ -17168,13 +17167,17 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore( | |||
const DataLayout &DL = II->getModule()->getDataLayout(); | |||
|
|||
bool UseScalable; | |||
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable)) | |||
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable)) { | |||
DeadInsts.clear(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As above, you cannot assume DeadInsts is empty on function entry.
|
||
// TODO: Add support for using SVE instructions with fixed types later, using | ||
// the code from lowerInterleavedStore to obtain the correct container type. | ||
if (UseScalable && !VTy->isScalableTy()) | ||
if (UseScalable && !VTy->isScalableTy()) { | ||
DeadInsts.clear(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As above, you cannot assume DeadInsts is empty on function entry.
DeadInsts.push_back(cast<Instruction>(II1)); | ||
DeadInsts.push_back(cast<Instruction>(II2)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Here II1
and II2
will only be populated when asserts are enabled, which means a release build will use uninitialised variables. Given you only get here when the main match
statement passes you can safely omit the assert and use II->getOperand(#)
directly.
update DeadInst vec on successful transformation only Change-Id: I920357d090960b84c46648666eca033033f209a3
420d864
to
6753d0e
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A few minor comments but otherwise this looks good to me.
%deinterleave_half2 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %4) | ||
%7 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half2, 0 | ||
%8 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half2, 1 | ||
ret void |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The other tests are fine but for wide_deinterleave4
it is worth adding the add/sub combination that you have for deinterleave4
because it better shows the extracted values end up where they're supposed to be.
// Only a factor of 2 supported at present. | ||
const unsigned Factor = 2; | ||
SmallVector<Instruction *, 4> DeinterleavedValues; | ||
SmallVector<Instruction *, 4> DeInterleaveDeadInsts; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is 8 a better size for DeInterleaveDeadInsts
since that's the number of dead instructions expected for deinterleave4?
if (UseScalable) | ||
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN"); | ||
else | ||
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN"); | ||
// Replace output of deinterleave2 intrinsic by output of ldN2/ldN4 | ||
for (unsigned I = 0; I < DeinterleavedValues.size(); I++) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you use Factor
here instead of DeinterleavedValues.size()
? since it's already cached.
Change-Id: I3849bda381128c5b2a4b6e19a8f3ffa20744895b
@@ -3155,8 +3155,9 @@ class TargetLoweringBase { | |||
/// | |||
/// \p DI is the deinterleave intrinsic. | |||
/// \p LI is the accompanying load instruction |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The comments on these 2 functions look outdated. It would be nice to explain what DeadInsts should look like going into and out of the function given it's a reference to a vector.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks!
Change-Id: I47fa70b02ceae17eae5eb3bc9f7aea57fa5e4f50
…#89276) - [AArch64]: TargetLowering is updated to spot load/store (de)interleave4 like sequences using PatternMatch, and emit equivalent sve.ld4 and sve.st4 intrinsics.
equivalent sve.ld4 and sve.st4 intrinsics.