[IA]: Construct (de)interleave4 out of (de)interleave2 #89276

Merged
merged 12 commits into llvm:main on Aug 12, 2024

Conversation


@hassnaaHamdi hassnaaHamdi commented Apr 18, 2024

  • [AArch64]: TargetLowering is updated to spot load/store (de)interleave4-like sequences using PatternMatch, and emit
    equivalent sve.ld4 and sve.st4 intrinsics.


llvmbot commented Apr 18, 2024

@llvm/pr-subscribers-llvm-transforms
@llvm/pr-subscribers-llvm-ir
@llvm/pr-subscribers-backend-risc-v

@llvm/pr-subscribers-backend-aarch64

Author: Hassnaa Hamdi (hassnaaHamdi)

Changes
  • InterleavedAccess pass is updated to spot load/store (de)interleave4-like sequences, and emit equivalent sve.ld4 or
    sve.st4 intrinsics on targets that support scalable vectors.
  • [AArch64][RISCV]: Tests are added for targets that support scalable vectors.

Patch is 24.17 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/89276.diff

8 Files Affected:

  • (modified) llvm/include/llvm/CodeGen/TargetLowering.h (+4)
  • (modified) llvm/lib/CodeGen/InterleavedAccessPass.cpp (+75-6)
  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+26-11)
  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.h (+2)
  • (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+25-4)
  • (modified) llvm/lib/Target/RISCV/RISCVISelLowering.h (+3-1)
  • (added) llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll (+78)
  • (added) llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll (+74)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index e0ade02959025f..e84b63f83d91ae 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -59,6 +59,8 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include <stack>
+#include <queue>
 
 namespace llvm {
 
@@ -3145,6 +3147,7 @@ class TargetLoweringBase {
   /// \p DI is the deinterleave intrinsic.
   /// \p LI is the accompanying load instruction
   virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+                                                SmallVector<Value*>& LeafNodes,
                                                 LoadInst *LI) const {
     return false;
   }
@@ -3156,6 +3159,7 @@ class TargetLoweringBase {
   /// \p II is the interleave intrinsic.
   /// \p SI is the accompanying store instruction
   virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+                                               SmallVector<Value*>& LeafNodes,
                                                StoreInst *SI) const {
     return false;
   }
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 438ac1c3cc6e2c..35fe04d6ae0469 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -71,6 +71,7 @@
 #include "llvm/Transforms/Utils/Local.h"
 #include <cassert>
 #include <utility>
+#include <queue>
 
 using namespace llvm;
 
@@ -510,12 +511,57 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
 
   LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
 
+  std::stack<IntrinsicInst*> DeinterleaveTreeQueue;
+  SmallVector<Value*> TempLeafNodes, LeafNodes;
+  std::map<IntrinsicInst*, bool>mp;
+  SmallVector<Instruction *> TempDeadInsts;
+
+  DeinterleaveTreeQueue.push(DI);
+  while(!DeinterleaveTreeQueue.empty()) {
+    auto CurrentDI = DeinterleaveTreeQueue.top();
+    DeinterleaveTreeQueue.pop();
+    TempDeadInsts.push_back(CurrentDI);
+    // iterate over extract users of deinterleave
+    for (auto UserExtract : CurrentDI->users()) {
+      Instruction *Extract = dyn_cast<Instruction>(UserExtract);
+      if (!Extract || Extract->getOpcode() != Instruction::ExtractValue)
+        continue;
+      bool IsLeaf = true;
+      // iterate over deinterleave users of extract
+      for (auto UserDI : UserExtract->users()) {
+        IntrinsicInst *Child_DI = dyn_cast<IntrinsicInst>(UserDI);
+        if (!Child_DI || 
+            Child_DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
+            continue;
+        IsLeaf = false;
+        if (mp.count(Child_DI) == 0) {
+          DeinterleaveTreeQueue.push(Child_DI);
+        }
+        continue;
+      }
+      if (IsLeaf) {
+        TempLeafNodes.push_back(UserExtract);
+        TempDeadInsts.push_back(Extract);
+      }
+      else {
+        TempDeadInsts.push_back(Extract);
+      }
+    }
+  }
+  // sort the deinterleaved nodes in the order that
+  // they will be extracted from the target-specific intrinsic.
+  for (unsigned I = 1; I < TempLeafNodes.size(); I+= 2)
+    LeafNodes.push_back(TempLeafNodes[I]);
+
+  for (unsigned I = 0; I < TempLeafNodes.size(); I+= 2)
+    LeafNodes.push_back(TempLeafNodes[I]);
+
   // Try and match this with target specific intrinsics.
-  if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI))
+  if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LeafNodes, LI))
     return false;
 
   // We now have a target-specific load, so delete the old one.
-  DeadInsts.push_back(DI);
+  DeadInsts.insert(DeadInsts.end(), TempDeadInsts.rbegin(), TempDeadInsts.rend());
   DeadInsts.push_back(LI);
   return true;
 }
@@ -531,14 +577,37 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
     return false;
 
   LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
-
+  std::queue<IntrinsicInst*> IeinterleaveTreeQueue;
+  SmallVector<Value*> TempLeafNodes, LeafNodes;
+  SmallVector<Instruction *> TempDeadInsts;
+
+  IeinterleaveTreeQueue.push(II);
+  while(!IeinterleaveTreeQueue.empty()) {
+    auto node = IeinterleaveTreeQueue.front();
+    TempDeadInsts.push_back(node);
+    IeinterleaveTreeQueue.pop();
+    for(unsigned i = 0; i < 2; i++) {
+      auto op = node->getOperand(i);
+      if(auto CurrentII = dyn_cast<IntrinsicInst>(op)) {
+        if (CurrentII->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
+            continue;
+        IeinterleaveTreeQueue.push(CurrentII);
+        continue;
+      }
+      TempLeafNodes.push_back(op);
+    }
+  }
+  for (unsigned I = 0; I < TempLeafNodes.size(); I += 2)
+    LeafNodes.push_back(TempLeafNodes[I]);
+  for (unsigned I = 1; I < TempLeafNodes.size(); I += 2)
+    LeafNodes.push_back(TempLeafNodes[I]);
   // Try and match this with target specific intrinsics.
-  if (!TLI->lowerInterleaveIntrinsicToStore(II, SI))
+  if (!TLI->lowerInterleaveIntrinsicToStore(II, LeafNodes, SI))
     return false;
 
   // We now have a target-specific store, so delete the old one.
   DeadInsts.push_back(SI);
-  DeadInsts.push_back(II);
+  DeadInsts.insert(DeadInsts.end(), TempDeadInsts.begin(), TempDeadInsts.end());
   return true;
 }
 
@@ -559,7 +628,7 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
       // with a factor of 2.
       if (II->getIntrinsicID() == Intrinsic::experimental_vector_deinterleave2)
         Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
-      if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
+      else if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
         Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
     }
   }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7947d73f9a4dd0..921335d8b49a0d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16345,15 +16345,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
 }
 
 bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
-    IntrinsicInst *DI, LoadInst *LI) const {
+    IntrinsicInst *DI, SmallVector<Value*>& LeafNodes, LoadInst *LI) const {
   // Only deinterleave2 supported at present.
   if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
     return false;
 
-  // Only a factor of 2 supported at present.
-  const unsigned Factor = 2;
+  const unsigned Factor = std::max(2, (int)LeafNodes.size());
 
-  VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
+  VectorType *VTy = (LeafNodes.size() > 0) ? cast<VectorType>(LeafNodes.front()->getType()) :
+                    cast<VectorType>(DI->getType()->getContainedType(0));
   const DataLayout &DL = DI->getModule()->getDataLayout();
   bool UseScalable;
   if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
@@ -16409,8 +16409,19 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
     Result = Builder.CreateInsertValue(Result, Left, 0);
     Result = Builder.CreateInsertValue(Result, Right, 1);
   } else {
-    if (UseScalable)
+    if (UseScalable) {
       Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
+      if (Factor == 2) {
+        DI->replaceAllUsesWith(Result);
+        return true;
+      }
+      for(unsigned I = 0; I < LeafNodes.size(); I ++) {
+        llvm::Value* CurrentExtract = LeafNodes[I];
+        Value *Newextrct = Builder.CreateExtractValue(Result, I);
+        CurrentExtract->replaceAllUsesWith(Newextrct);
+      }
+      return true;
+    }
     else
       Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
   }
@@ -16420,15 +16431,15 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
 }
 
 bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
-    IntrinsicInst *II, StoreInst *SI) const {
+    IntrinsicInst *II, SmallVector<Value*>& LeafNodes, StoreInst *SI) const {
   // Only interleave2 supported at present.
   if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
     return false;
 
-  // Only a factor of 2 supported at present.
-  const unsigned Factor = 2;
+  // leaf nodes are the nodes that will be interleaved
+  const unsigned Factor = LeafNodes.size();
 
-  VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
+  VectorType *VTy = cast<VectorType>(LeafNodes.front()->getType());
   const DataLayout &DL = II->getModule()->getDataLayout();
   bool UseScalable;
   if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
@@ -16473,8 +16484,12 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
       R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
     }
 
-    if (UseScalable)
-      Builder.CreateCall(StNFunc, {L, R, Pred, Address});
+    if (UseScalable) {
+      SmallVector<Value *> Args(LeafNodes);
+      Args.push_back(Pred);
+      Args.push_back(Address);
+      Builder.CreateCall(StNFunc, Args);
+    }
     else
       Builder.CreateCall(StNFunc, {L, R, Address});
   }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index db6e8a00d2fb5e..1f91113221636e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -683,9 +683,11 @@ class AArch64TargetLowering : public TargetLowering {
                              unsigned Factor) const override;
 
   bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+                                        SmallVector<Value*>& LeafNodes,
                                         LoadInst *LI) const override;
 
   bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+                                       SmallVector<Value*>& LeafNodes,
                                        StoreInst *SI) const override;
 
   bool isLegalAddImmediate(int64_t) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index dc7c6f83b98579..5218ef47c56065 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21025,6 +21025,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
 }
 
 bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+                                                           SmallVector<Value*>& LeafNodes,
                                                            LoadInst *LI) const {
   assert(LI->isSimple());
   IRBuilder<> Builder(LI);
@@ -21033,10 +21034,11 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
   if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
     return false;
 
-  unsigned Factor = 2;
+  unsigned Factor = std::max(2, (int)LeafNodes.size());
 
   VectorType *VTy = cast<VectorType>(DI->getOperand(0)->getType());
-  VectorType *ResVTy = cast<VectorType>(DI->getType()->getContainedType(0));
+  VectorType *ResVTy = (LeafNodes.size() > 0) ? cast<VectorType>(LeafNodes.front()->getType()) :
+                        cast<VectorType>(DI->getType()->getContainedType(0));
 
   if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(),
                                     LI->getPointerAddressSpace(),
@@ -21064,6 +21066,19 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
                                            {ResVTy, XLenTy});
     VL = Constant::getAllOnesValue(XLenTy);
     Ops.append(Factor, PoisonValue::get(ResVTy));
+    Ops.append({LI->getPointerOperand(), VL});
+    Value *Vlseg = Builder.CreateCall(VlsegNFunc, Ops);
+    //-----------
+    if (Factor == 2) {
+      DI->replaceAllUsesWith(Vlseg);
+      return true;
+    }
+    for (unsigned I = 0; I < LeafNodes.size(); I ++) {
+      auto CurrentExtract = LeafNodes[I];
+      Value *NewExtract = Builder.CreateExtractValue(Vlseg, I);
+      CurrentExtract->replaceAllUsesWith(NewExtract);
+    }
+    return true;
   }
 
   Ops.append({LI->getPointerOperand(), VL});
@@ -21075,6 +21090,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
 }
 
 bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+                                                          SmallVector<Value*>& LeafNodes,
                                                           StoreInst *SI) const {
   assert(SI->isSimple());
   IRBuilder<> Builder(SI);
@@ -21083,10 +21099,10 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
   if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
     return false;
 
-  unsigned Factor = 2;
+  unsigned Factor = LeafNodes.size();
 
   VectorType *VTy = cast<VectorType>(II->getType());
-  VectorType *InVTy = cast<VectorType>(II->getOperand(0)->getType());
+  VectorType *InVTy = cast<VectorType>(LeafNodes.front()->getType());
 
   if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(),
                                     SI->getPointerAddressSpace(),
@@ -21112,6 +21128,11 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
     VssegNFunc = Intrinsic::getDeclaration(SI->getModule(), IntrIds[Factor - 2],
                                            {InVTy, XLenTy});
     VL = Constant::getAllOnesValue(XLenTy);
+    SmallVector<Value *> Args(LeafNodes);
+    Args.push_back(SI->getPointerOperand());
+    Args.push_back(VL);
+    Builder.CreateCall(VssegNFunc, Args);
+    return true;
   }
 
   Builder.CreateCall(VssegNFunc, {II->getOperand(0), II->getOperand(1),
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index b10da3d40befb7..24ed1ce7dbccce 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -855,10 +855,12 @@ class RISCVTargetLowering : public TargetLowering {
   bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
-  bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *II,
+  bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+                                        SmallVector<Value*>& LeafNodes,
                                         LoadInst *LI) const override;
 
   bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+                                       SmallVector<Value*>& LeafNodes,
                                        StoreInst *SI) const override;
 
   bool supportKCFIBundles() const override { return true; }
diff --git a/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll b/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
new file mode 100644
index 00000000000000..606bb93e309e12
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+%struct.xyzt = type { i32, i32, i32, i32 }
+
+define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
+; CHECK-LABEL: loop_xyzt:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    cntw x10
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    mov w9, #1024 // =0x400
+; CHECK-NEXT:    neg x10, x10
+; CHECK-NEXT:    rdvl x11, #4
+; CHECK-NEXT:  .LBB0_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    add x12, x1, x8
+; CHECK-NEXT:    adds x9, x9, x10
+; CHECK-NEXT:    ld4w { z0.s - z3.s }, p0/z, [x12]
+; CHECK-NEXT:    add x12, x2, x8
+; CHECK-NEXT:    ld4w { z4.s - z7.s }, p0/z, [x12]
+; CHECK-NEXT:    add x12, x0, x8
+; CHECK-NEXT:    add x8, x8, x11
+; CHECK-NEXT:    add z16.s, z4.s, z0.s
+; CHECK-NEXT:    sub z17.s, z1.s, z5.s
+; CHECK-NEXT:    movprfx z18, z2
+; CHECK-NEXT:    lsl z18.s, p0/m, z18.s, z6.s
+; CHECK-NEXT:    movprfx z19, z3
+; CHECK-NEXT:    asr z19.s, p0/m, z19.s, z7.s
+; CHECK-NEXT:    st4w { z16.s - z19.s }, p0, [x12]
+; CHECK-NEXT:    b.ne .LBB0_1
+; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 2
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %2 = getelementptr inbounds %struct.xyzt, ptr %a, i64 %index
+  %wide.vec = load <vscale x 16 x i32>, ptr %2, align 4
+  %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
+  %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
+  %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
+  %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+  %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
+  %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
+  %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+  %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
+  %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
+  %9 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %index
+  %wide.vec57 = load <vscale x 16 x i32>, ptr %9, align 4
+  %root.strided.vec58 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec57)
+  %10 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 0
+  %11 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 1
+  %root.strided.vec59 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %10)
+  %12 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 0
+  %13 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 1
+  %root.strided.vec60 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %11)
+  %14 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 0
+  %15 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 1
+  %16 = add nsw <vscale x 4 x i32> %12, %5
+  %17 = sub nsw <vscale x 4 x i32> %7, %14
+  %18 = shl <vscale x 4 x i32> %6, %13
+  %19 = ashr <vscale x 4 x i32> %8, %15
+  %20 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %index
+  %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %16, <vscale x 4 x i32> %18)
+  %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %17, <vscale x 4 x i32> %19)
+  %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
+  store <vscale x 16 x i32> %interleaved.vec62, ptr %20, align 4
+  %index.next = add nuw i64 %index, %1
+  %21 = icmp eq i64 %index.next, 1024
+  br i1 %21, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
new file mode 100644
index 00000000000000..2ea14b13265c61
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh | FileCheck %s
+
+%struct.xyzt = type { i32, i32, i32, i32 }
+
+define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
+; CHECK-LABEL: loop_xyzt:
+; CHE...
[truncated]


github-actions bot commented Apr 18, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

Collaborator

@paulwalker-arm paulwalker-arm left a comment

I've not looked at the ISelLowering changes yet because I want to get an understanding of what we need to match first, since that might dictate where the code ends up.

The patch could also do with some SVE VLS tests (see fixed-deinterleave-intrinsics.ll). I'm not asking for this work to support such types, only that tests exist to ensure they don't trigger any failure paths. This mirrors the same request I had for https://reviews.llvm.org/D146218.

Preferably, the tests would be the first commit on this PR, with the transformation in the following commits, so that it's easy to see the effect of the new code. I guess you could precommit the tests, but personally I'm fine with them remaining within this PR.

Comment on lines 585 to 604
while (!IeinterleaveTreeQueue.empty()) {
auto node = IeinterleaveTreeQueue.front();
TempDeadInsts.push_back(node);
IeinterleaveTreeQueue.pop();
for (unsigned i = 0; i < 2; i++) {
auto op = node->getOperand(i);
if (auto CurrentII = dyn_cast<IntrinsicInst>(op)) {
if (CurrentII->getIntrinsicID() !=
Intrinsic::experimental_vector_interleave2)
continue;
IeinterleaveTreeQueue.push(CurrentII);
continue;
}
TempLeafNodes.push_back(op);
}
}
for (unsigned I = 0; I < TempLeafNodes.size(); I += 2)
LeafNodes.push_back(TempLeafNodes[I]);
for (unsigned I = 1; I < TempLeafNodes.size(); I += 2)
LeafNodes.push_back(TempLeafNodes[I]);
Collaborator

This looks rather more complex than I was expecting. I was hoping for something akin to:

match(II, interleave(interleave(Value(A), Value(C)), interleave(Value(B), Value(D))))

Have I got my logic wrong and thus the extra complexity is necessary?
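
(For readers following along: a minimal sketch of such a match, assuming the m_Interleave2 matcher that is discussed later in this review; the variable names are illustrative only.)

// Sketch only: match a "theoretical interleave4" rooted at II, i.e.
// interleave2(interleave2(A, C), interleave2(B, D)).
SmallVector<Value *, 4> InterleaveOps;
Value *A, *B, *C, *D;
if (match(II, m_Interleave2(m_Interleave2(m_Value(A), m_Value(C)),
                            m_Interleave2(m_Value(B), m_Value(D)))))
  // st4 would then take the operands in the order A, B, C, D.
  InterleaveOps = {A, B, C, D};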

Comment on lines 6 to 14
define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
; CHECK-LABEL: loop_xyzt:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: cntw x10
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: mov w9, #1024 // =0x400
; CHECK-NEXT: neg x10, x10
; CHECK-NEXT: rdvl x11, #4
Collaborator

Please can you simplify the tests to the minimum required to test the new code. In this instance there should be no need for any loop, just straight line code for a single load with interleave.

Collaborator

And drop unnecessary attributes like dso_local

Comment on lines 520 to 522
while (!DeinterleaveTreeQueue.empty()) {
auto CurrentDI = DeinterleaveTreeQueue.top();
DeinterleaveTreeQueue.pop();
Collaborator

My comment about the interleave parsing also applies here in that I was anticipating a simple pattern match.

inline Deinterleave2_match<LHS, RHS> m_Deinterleave2(const LHS &L, const RHS &R) {
return Deinterleave2_match<LHS, RHS>(L, R);
}

Member Author

Hi @paulwalker-arm
I have added pattern matchers as you advised. Is this what you mean?
I have tested that implementation locally, but I don't want to update the other files until we agree on the approach.

Collaborator

@paulwalker-arm paulwalker-arm Apr 29, 2024

Not quite. PatternMatch should not care about how the IR is used, that's up to the user of the interface. It exists purely to provide a convenient way to express the tree of IR to compare against. Interleave2_match does this but Deinterleave2_match is trying to encode logic for your specific use case, which belongs with the transformation itself (NOTE: m_ExtractValue exists so presumably you'll want to use that when constructing your match call).

FYI: m_Interleave2 can be simplified, see m_VecReverse.
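
For context, m_VecReverse-style matchers are thin wrappers over m_Intrinsic, so a simplified m_Interleave2 could end up looking roughly like this (a sketch using the intrinsic name that appears in this patch, not necessarily the final upstream spelling):

// Sketch for llvm/include/llvm/IR/PatternMatch.h, mirroring m_VecReverse.
template <typename Opnd0, typename Opnd1>
inline typename m_Intrinsic_Ty<Opnd0, Opnd1>::Ty
m_Interleave2(const Opnd0 &Op0, const Opnd1 &Op1) {
  // Delegates to the generic intrinsic matcher; no bespoke match logic needed.
  return m_Intrinsic<Intrinsic::experimental_vector_interleave2>(Op0, Op1);
}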

Member Author

I have updated it

@@ -56,6 +56,8 @@
#include <cstdint>
#include <iterator>
#include <map>
#include <queue>
Collaborator

Why do we need these headers in the .h file?

Collaborator

I think this question still needs to be addressed. There are no other changes in this file. Is this needed? If it is needed, should it be in a .cpp file?

@@ -3145,6 +3147,7 @@ class TargetLoweringBase {
/// \p DI is the deinterleave intrinsic.
/// \p LI is the accompanying load instruction
virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
SmallVector<Value *> &LeafNodes,
Collaborator

Use SmallVectorImpl
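
That is, the hook would take a SmallVectorImpl reference so callers aren't tied to a particular inline size; roughly (a sketch of the suggested signature only):

virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
                                              SmallVectorImpl<Value *> &LeafNodes,
                                              LoadInst *LI) const {
  return false; // Default: the target does not handle it.
}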

@@ -3156,6 +3159,7 @@ class TargetLoweringBase {
/// \p II is the interleave intrinsic.
/// \p SI is the accompanying store instruction
virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
SmallVector<Value *> &LeafNodes,
Collaborator

Use SmallVectorImpl


template <typename ITy> bool match(ITy *V) {
auto *I = dyn_cast<IntrinsicInst>(V);
if (m_Intrinsic<Intrinsic::experimental_vector_interleave2>().match(V)) {
Collaborator

drop curly braces around single line body

}

// Match deinterleave tree.
// if the current user of deinterelave is the last (extract instr) in the tree,
Collaborator

deinterelave -> deinterleave

@@ -21064,6 +21067,19 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
{ResVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
Ops.append(Factor, PoisonValue::get(ResVTy));
Ops.append({LI->getPointerOperand(), VL});
Value *Vlseg = Builder.CreateCall(VlsegNFunc, Ops);
//-----------
Collaborator

Drop this comment

SmallVector<Value *> Args(LeafNodes);
Args.push_back(SI->getPointerOperand());
Args.push_back(VL);
Builder.CreateCall(VssegNFunc, Args);
Collaborator

use a braced list instead of SmallVector


IeinterleaveTreeQueue.push(II);
while (!IeinterleaveTreeQueue.empty()) {
auto node = IeinterleaveTreeQueue.front();
Collaborator

Capitalize variable names.

And use auto * instead of just auto. Don't hide pointers in auto.

}
for (unsigned I = 0; I < LeafNodes.size(); I++) {
llvm::Value *CurrentExtract = LeafNodes[I];
Value *Newextrct = Builder.CreateExtractValue(Result, I);
Collaborator

Newextrct -> NewExtract.

Comment on lines 6 to 14
define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
; CHECK-LABEL: loop_xyzt:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: cntw x10
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: mov w9, #1024 // =0x400
; CHECK-NEXT: neg x10, x10
; CHECK-NEXT: rdvl x11, #4
Collaborator

And drop unnecessary attributes like dso_local

; CHECK-NEXT: movprfx z5, z27
; CHECK-NEXT: asr z5.s, p0/m, z5.s, z1.s
; CHECK-NEXT: st4w { z28.s - z31.s }, p0, [x0]
; CHECK-NEXT: st4w { z2.s - z5.s }, p0, [x0, #4, mul vl]
; CHECK-NEXT: ret
%wide.vec = load <vscale x 32 x i32>, ptr %a, align 4
%root.strided.vec = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %wide.vec)
Member Author

@paulwalker-arm I have added the test file as a pre-commit.

@hassnaaHamdi
Member Author

@topperc Sorry Craig, I had to remove the code related to the RISCV target because the solution was changed.
I resolved your comments that relate to the current change.
Sorry for taking up your time, and thanks for your help.

@@ -56,6 +56,8 @@
#include <cstdint>
#include <iterator>
#include <map>
#include <queue>
Collaborator

I think this question still needs to be addressed. There are no other changes in this file. Is this needed? If it is needed, should it be in a .cpp file?

llvm::Value *CurrentExtract = ValuesToDeinterleave[I];
Value *NewExtract = Builder.CreateExtractValue(Result, I);
CurrentExtract->replaceAllUsesWith(NewExtract);
dyn_cast<Instruction>(CurrentExtract)->eraseFromParent();
Collaborator

Use cast since you expect it to return a non-null value.

@hassnaaHamdi hassnaaHamdi force-pushed the lower_intrlv4 branch 2 times, most recently from dec7c6b to cf5a27c on May 20, 2024 15:26
@hassnaaHamdi hassnaaHamdi force-pushed the lower_intrlv4 branch 2 times, most recently from f26b52a to b2288dd on June 3, 2024 14:31
Collaborator

@paulwalker-arm paulwalker-arm left a comment

Sorry for the delay. I've only looked at the interleaving side of things so far but I suspect GetDeinterleaveLeaves might have the same issues as GetInterleaveLeaves assuming I've not misunderstood the code.

@@ -16441,17 +16441,56 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}

bool GetDeinterleaveLeaves(Value *DI,
Collaborator

The coding standard requires functions to start with lowercase.

@@ -16493,39 +16529,71 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
else
LdN = Builder.CreateCall(LdNFunc, Address, "ldN");

Collaborator

Is this not good whitespace to separate the if block from the code that follows?

Comment on lines 16572 to 16582
if (!match(II, m_Interleave2(m_Value(Op0), m_Value(Op1))))
return false;

if (!GetInterleaveLeaves(Op0, InterleaveOps)) {
InterleaveOps.push_back(Op0);
}

if (!GetInterleaveLeaves(Op1, InterleaveOps)) {
InterleaveOps.push_back(Op1);
}
return true;
Collaborator

This looks too general to me given we require one of two specific patterns. I see a couple of problems:

  1. The return operands don't look to be ordered correctly for how st4 works?
    interleave2(interleave2(Value(A), Value(B)), interleave2(Value(C), Value(D))) will return InterleaveOps = {A, B, C, D}, but from the previous conversation I believe interleave4(A, B, C, D) is the equivalent of interleave2(interleave2(Value(A), Value(C)), interleave2(Value(B), Value(D)))?

  2. We'll miss places where st2 or st4 can be used based purely because their operands are the result of a call to interleave2. For example, interleave2(interleave2(Value(D), Value(C)), interleave2(Value(B), Value(A))) can still use st2 it's just the two child interleave2 calls will remain.

Perhaps we can just look for the specific st4 pattern, and if that fails we then look for the st2 pattern (which is a given, because we already know II is a call to interleave2).
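
Sketching that suggestion (using the m_Interleave2 matcher discussed above; purely illustrative):

// Sketch: prefer the factor-4 shape, otherwise fall back to factor 2.
SmallVector<Value *, 4> InterleaveOps;
Value *A, *B, *C, *D;
if (match(II, m_Interleave2(m_Interleave2(m_Value(A), m_Value(C)),
                            m_Interleave2(m_Value(B), m_Value(D)))))
  InterleaveOps = {A, B, C, D}; // st4 operand order
else
  // II is already known to be an interleave2 call, so st2 always applies.
  InterleaveOps = {II->getOperand(0), II->getOperand(1)};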

SmallVector<Value *, 4> ValuesToInterleave;
GetInterleaveLeaves(II, ValuesToInterleave);
unsigned Factor = ValuesToInterleave.size();
assert(Factor >= 2 && "Expected Interleave Factor >= 2");
Collaborator

Is an assert sufficient? What happens when Factor is 3 or 5?


%struct.xyzt = type { i32, i32, i32, i32 }

define void @interleave(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, <vscale x 4 x i32> %x) {
Collaborator

Given the changes are at the IR level I think it best for the tests to use opt rather than llc. Also, it is better to test the two idioms (i.e. interleave4 and deinterleave4) in isolation. If you look at llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll you'll see what I mean on both counts.
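
For example, an opt-based RUN line would look something like the following (an assumption based on how the referenced sve-interleaved-accesses.ll drives the pass; the exact flags may differ):

; RUN: opt -mtriple=aarch64-linux-gnu -mattr=+sve -passes=interleaved-access -S < %s | FileCheck %s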

Comment on lines 16617 to 16620
auto *A = *(++DI1->user_begin());
auto *C = *(DI1->user_begin());
auto *B = *(++DI2->user_begin());
auto *D = *(DI2->user_begin());
Collaborator

Is this making an assumption about the order the results will be extracted? i.e. it looks like you're assuming A will be m_ExtractValue<0> and B is m_ExtractValue<0>. You also do this above but at least there the bad result would bogusly bail out of the transformation whereas here the emitted code will be wrong.

llvm::Value *CurrentExtract = DeinterleavedValues[I];
Value *NewExtract = Builder.CreateExtractValue(Result, I);
CurrentExtract->replaceAllUsesWith(NewExtract);
cast<Instruction>(CurrentExtract)->eraseFromParent();
Collaborator

It's possible for DeinterleavedValues to hold Instruction* rather than Value*; then you'll not need such casts?

With that said, given the IR chain for deinterleaving is non-trivial, perhaps using RecursivelyDeleteTriviallyDeadInstructions at the end of the function would simplify things, because you'll not need to track DeadInsts other than the top-level extracts you're replacing here.

Member Author

With that said, given the IR chain for deinterleaving is non-trivial, perhaps using RecursivelyDeleteTriviallyDeadInstructions at the end of the function would simplify things, because you'll not need to track DeadInsts other than the top-level extracts you're replacing here.

I resolved that comment but without recursive deletion. I think it's simpler without the recursive deletion.

Collaborator

@paulwalker-arm paulwalker-arm left a comment

This time I've limited myself to the deinterleave side of things. The code structure now looks very good with my comments aiming to simplify a few things whilst bringing the existing Factor==2 code in line with your Factor==4 implementation that's much more resilient.

@@ -16585,17 +16585,87 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}

bool getDeinterleavedValues(Value *DI,
SmallVectorImpl<Instruction *> &DeinterleavedValues) {
if (!DI->hasNUsesOrMore(2))
Collaborator

Is the "OrMore" part important or can this just be hasNUses(2)? I'm thinking if there are other users then we'll not be able to delete the instructions and thus might not gain anything from performing the transformation.

To further this point, the code that follows assumes the first uses will be the extracts you're looking for, which is perfectly fine and expected, but does suggest you only need to care about an exact use count.

auto *DI1 = *(Extr1->user_begin());
auto *DI2 = *(Extr2->user_begin());

if (!DI1->hasNUsesOrMore(2) || !DI2->hasNUsesOrMore(2))
Collaborator

As above.

if (!Extr1 || !Extr2)
return false;

if (!Extr1->hasNUsesOrMore(1) || !Extr2->hasNUsesOrMore(1))
Collaborator

As above.

@@ -16585,17 +16585,87 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}

bool getDeinterleavedValues(Value *DI,
Collaborator

There's a level of complexity with this function that I think an ASCII-art diagram of the IR would help with.

Comment on lines 16616 to 16619
DeinterleavedValues[A->getIndices()[0] + (Extr1->getIndices()[0] * 2)] = A;
DeinterleavedValues[B->getIndices()[0] + (Extr1->getIndices()[0] * 2)] = B;
DeinterleavedValues[C->getIndices()[0] + (Extr2->getIndices()[0] * 2)] = C;
DeinterleavedValues[D->getIndices()[0] + (Extr2->getIndices()[0] * 2)] = D;
Collaborator

It looks like the *2 is being applied to the wrong extract, which explains why you had to swap the middle values below.

It depends on how paranoid we need to be because I suppose there's the chance that an index might not be 0 or 1, so worth adding & 0x3 to limit the range?

You'll also need to have another round of null pointer checks for DeinterleavedValues[0-3] just in case the extracts are not evenly distributed.


// Make sure that A,B,C,D match the deinterleave tree pattern
if (!match(DeinterleavedValues[0], m_ExtractValue<0>(m_Deinterleave2(
m_ExtractValue<0>(m_Deinterleave2(m_Value()))))) ||
Collaborator

Given DI is passed in and we can assume it's the top level deinterleave2 intrinsic I think for all these match lines you can use the scheme:

m_ExtractValue<#>(m_Deinterleave2(m_ExtractValue<#>(m_Specific(DI))))

Comment on lines 16624 to 16627
!match(DeinterleavedValues[1], m_ExtractValue<1>(m_Deinterleave2(
m_ExtractValue<0>(m_Deinterleave2(m_Value()))))) ||
!match(DeinterleavedValues[2], m_ExtractValue<0>(m_Deinterleave2(
m_ExtractValue<1>(m_Deinterleave2(m_Value()))))) ||
Collaborator

To match my earlier comment I think the m_ExtractValue indices need to be swapped and then the std::swap below can be removed.

Comment on lines 16638 to 16643
void deleteDeadDeinterleaveInstructions(Instruction *DeadRoot) {
Value *DeadDeinterleave = nullptr, *DeadExtract = nullptr;
match(DeadRoot, m_ExtractValue(m_Value(DeadDeinterleave)));
assert(DeadDeinterleave != nullptr && "Match is expected to succeed");
match(DeadDeinterleave, m_Deinterleave2(m_Value(DeadExtract)));
assert(DeadExtract != nullptr && "Match is expected to succeed");
Collaborator

Sorry for the misdirect but with the new simpler code structure and assuming you agree with my "strict NUses" comment then you're now in a position to know exactly which instructions are dead and can thus maintain a dead instruction list as you did in the previous version of this PR.

Member Author

But in the last PR, I also was in a position to know exactly which instructions are dead, so what is the difference?
This is the old code:

  // These Values will not be used anymre,
  // DI4 will be created instead of nested DI1 and DI2
  DeadInsts.push_back(cast<Instruction>(DI1));
  DeadInsts.push_back(cast<Instruction>(Extr0));
  DeadInsts.push_back(cast<Instruction>(DI2));
  DeadInsts.push_back(cast<Instruction>(Extr1));

Collaborator

Quite possibly just my misunderstanding of the code at that time. I do think the use count checking wasn't strict though, much like with the current version, so perhaps that made me think there was more complexity around whether the instructions were actually dead.

Comment on lines 16662 to 16665
if (getDeinterleavedValues(DI, DeinterleavedValues)) {
Factor = DeinterleavedValues.size();
VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
}
Collaborator

It would be nice to unify the code a little more, especially because after inspection I think the current upstream Factor==2 code is not good, especially when compared to your new Factor==4 code. If you don't mind improving that as part of this PR, then I think it's worth implementing something like:

getDeinterleavedValues(DI, DeinterleavedValues....) {
  if (getDeinterleaved4Values(DI, DeinterleavedValues...))
    return true;
  return getDeinterleaved2Values(DI, DeinterleavedValues...);
}

and then here you'd do as you've done for the interleave handling, i.e.

  if (!getDeinterleavedValues(DI, DeinterleavedValues....))
    return false;

I'm assuming getDeinterleaved2Values will be a fairly trivial function
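
A sketch of what that trivial factor-2 helper might look like (names follow the suggestion above; this is illustrative, not the code that landed):

// Sketch: for factor 2 the "deinterleaved values" are just DI's two extracts.
static bool getDeinterleaved2Values(Value *DI,
                                    SmallVectorImpl<Instruction *> &DeinterleavedValues) {
  if (!DI->hasNUses(2))
    return false;
  auto *Extr0 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
  auto *Extr1 = dyn_cast<ExtractValueInst>(*(++DI->user_begin()));
  if (!Extr0 || !Extr1)
    return false;
  // Return them in {index 0, index 1} order regardless of use-list order.
  if (Extr0->getIndices()[0] != 0)
    std::swap(Extr0, Extr1);
  DeinterleavedValues.push_back(Extr0);
  DeinterleavedValues.push_back(Extr1);
  return true;
}

A real implementation would also want to verify the extract indices really are {0, 1} and apply the stricter use-count checks discussed above.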

Comment on lines 16728 to 16739
if (Factor > 2) {
// Itereate over old deinterleaved values to replace it by
// the new deinterleaved values.
for (unsigned I = 0; I < DeinterleavedValues.size(); I++) {
Value *NewExtract = Builder.CreateExtractValue(Result, I);
DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
}
for (unsigned I = 0; I < DeinterleavedValues.size(); I++)
deleteDeadDeinterleaveInstructions(DeinterleavedValues[I]);
return true;
}
DI->replaceAllUsesWith(Result);
Collaborator

@paulwalker-arm paulwalker-arm Jun 21, 2024

With my getDeinterleavedValues suggestion above I think this can be unified as well and would fix the existing issue whereby the Factor==2 code is assuming the order of the extracts.

Change-Id: I5e2613156a482dcadae3e4cfa1bacdf7f3293fe2
- InterleavedAccess pass is updated to spot load/store (de)interleave4 like sequences,
  and emit equivalent sve.ld4 or sve.st4 intrinsics through targets that support SV.
- Tests are added for targets that support SV.

Change-Id: I76ef31080ddd72b182c1a3b1752a6178dc78ea84
Change-Id: Id94189e601ed70c5ea922f9adbee63cf8b80829a
Change-Id: Id7639dcb125a2f642b2fea78ea884b74be1c6b74
Change-Id: I2b2dc683dba21cdb6c35f407868a7537245c845e
…e of the order using pattern match

Change-Id: I053e47d156c37cf4d7ab5b2af83c348b4210631a
add negative tests

Change-Id: Id323139f11ddc4d3a22a72af84a01e98dadfe46d
Collaborator

@paulwalker-arm paulwalker-arm left a comment

Mostly superficial comments other than a restructuring of where dead instructions are erased.

Comment on lines 17089 to 17092
if (Factor == 2)
Result = PoisonValue::get(StructType::get(VTy, VTy));
else
Result = PoisonValue::get(StructType::get(VTy, VTy, VTy, VTy));
Collaborator

Would the following work?

SmallVector<Type *, 4> ResultFieldTys(Factor, VTy);
Result = PoisonValue::get(StructType::create(ResultFieldTys));

Value *Right = PoisonValue::get(VTy);

// Create multiple legal small ldN instead of a wide one.
SmallVector<Value *, 4> WideValues(Factor, (PoisonValue::get(VTy)));
Collaborator

Redundant brackets wrapping PoisonValue::get(VTy)?

Result = PoisonValue::get(StructType::get(VTy, VTy));
else
Result = PoisonValue::get(StructType::get(VTy, VTy, VTy, VTy));
// Construct the wide result out of the small results.
Collaborator

The comment doesn't look correct to me and in general offers little value. The following code isn't creating a wide result; it's just creating the IR necessary to return multiple results, hence populating a struct.

With that said I think you can remove this loop anyway. (See next comment relating to CreateExtractValue).

// Itereate over old deinterleaved values to replace it by
// the new values.
for (unsigned I = 0; I < DeinterleavedValues.size(); I++) {
Value *NewExtract = Builder.CreateExtractValue(Result, I);
Collaborator

Here you're extracting a field from a struct that you'd only just added, so I'm thinking you can use WideValues directly?

I'm assuming the code is like it is because of the NumLoads==1 case, so I'm wondering if you can move the extract values into the else block, effectively ensuring WideValues (or perhaps a better name is now required) is set on both sides of the NumLoads comparison. Then here you have access to the extracted values directly (i.e. for the Factor==4 case there would be no need to create the intermediary struct, and my two previous comments become irrelevant).

Comment on lines 17124 to 17128
values in correct order of interleave4: A B C D.
If there is a pattern matches the interleave tree above, then we can construct
Interleave4 out of that pattern. This function tries to match the interleave
tree pattern, and fetch the values that we want to interleave, so that in
further steps they can be replaced by the output of Inteleave4.
Collaborator

Similar to before, perhaps:

Returns true if `II` is the root of an IR tree that represents a theoretical vector.interleave4 intrinsic. When true is returned `ValuesToInterleave` is populated with the inputs such an intrinsic would take (i.e. vector.interleave4(A, B, C, D)).

Comment on lines 16947 to 16951
roots in correct order of DI4: A B C D.
If there is a pattern matches the deinterleave tree above, then we can construct
DI4 out of that pattern. This function tries to match the deinterleave tree
pattern, and fetch the tree roots, so that in further steps they can be replaced
by the output of DI4.
Collaborator

Up to you but perhaps:

Returns true if `DI` is the top of an IR tree that represents a theoretical vector.deinterleave4 intrinsic. When true is returned, `DeinterleavedValues` is populated with the results such an intrinsic would return (i.e. {A, B, C, D } = vector.deinterleave4(...))

if (!Extr1 || !Extr2)
return false;

if (!Extr1->hasNUses(1) || !Extr2->hasNUses(1))
Collaborator

Please use !###->hasOneUse() here because it's more efficient than !hasNUses(1).

Comment on lines 16975 to 16976
// Make sure that the A,B,C,D are instructions of ExtractValue,
// before getting the extract index
Collaborator

Make sure A,B,C and D are ExtractValue instructions before....

Builder.CreateCall(StNFunc, {L, R, Pred, Address});
else
Builder.CreateCall(StNFunc, {L, R, Address});
Builder.CreateCall(StNFunc, ValuesToInterleave);
Collaborator

After this point it looks like there will be some dead instructions that remain after the transformation because the caller of lowerInterleaveIntrinsicToStore will only erase the store and first call to vector.interleave2.

You'll have to maintain a dead instruction list like you have for the deinterleaving case, but because instructions are erased "uses first" I think it best for the SmallVector to be passed into lowerInterleaveIntrinsicToStore, which the target then populates, and then the caller should iterate across it calling erase.

If you agree with this change then I think it makes sense to do the same for lowerDeinterleaveIntrinsicToLoad so that all instruction deletion is the responsibility of the caller. The effect on other targets will be minimal because, as it stands, they don't have any dead instructions and so will just ignore the new parameter.
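
In other words, on the InterleavedAccessPass side the flow would be roughly as follows (a sketch; the exact parameter order is whatever the hook ends up with):

SmallVector<Instruction *, 32> DeadInsts;
if (!TLI->lowerInterleaveIntrinsicToStore(II, SI, DeadInsts))
  return false;
// The target appended users before their operands ("uses first"), so erasing
// in order is safe.
for (Instruction *I : DeadInsts)
  I->eraseFromParent();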

Comment on lines 35 to 37
; NEON-NEXT: [[LOAD:%.*]] = load <16 x i16>, ptr [[PTR]], align 2
; NEON-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <8 x i16>, <8 x i16> } @llvm.vector.deinterleave2.v16i16(<16 x i16> [[LOAD]])
; NEON-NEXT: ret { <8 x i16>, <8 x i16> } [[DEINTERLEAVE]]
Collaborator

This suggests you'll need to update more of the tests like you have above by adding instances of extractvalue.

At some point we might want to restore the original behaviour, but I don't think the current way these tests are written matches the intent of the original work, and I'm happy to lose this bit of flexibility given the bugs this PR fixes alongside the new factor4 support.

Member Author

Sorry, I don't understand the second part of the comment.

Collaborator

@paulwalker-arm paulwalker-arm Aug 2, 2024

The original tests show the deinterleave(load())->ld2 transformation can be done when the results of the deinterleave are not explicitly extracted (via extractvalue). With your change the transformation is only possible when the results of the deinterleave are explicitly extracted.

Personally I don't see this as a problem at this stage because that's not the expected use case and so I'm happy for the tests to be updated to show the transformation as you did for deinterleave_i8_factor2.

- remove dead instructions of interleave tree.
- code improvement.

Change-Id: I2020bf850c987829bf6e2255425eacee9361a980
Change-Id: I5d1cada783bb3da0a5d51fd10c98428d63db9c13
@@ -3167,7 +3168,8 @@ class TargetLoweringBase {
/// \p II is the interleave intrinsic.
/// \p SI is the accompanying store instruction
virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
StoreInst *SI) const {
StoreInst *SI,
SmallVectorImpl<Instruction *> &DeadInstructions) const {
Collaborator

Please use consistent names for both functions (i.e. either DeadInsts or DeadInstructions but not both).

@@ -17042,13 +17041,17 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
VectorType *VTy = cast<VectorType>(DeinterleavedValues[0]->getType());

bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable)) {
DeadInsts.clear();
Collaborator

This assumes DeadInsts will be empty when entering the function, which you shouldn't rely on in general. You likely need a local until you're ready to commit to the transformation and then update DeadInsts accordingly.
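
i.e. roughly (a sketch; whether getDeinterleavedValues itself takes the dead list is an assumption about how this PR evolves):

// Sketch: collect into a local list and only commit it once the
// transformation is definitely going ahead.
SmallVector<Instruction *, 8> LocalDeadInsts;
if (!getDeinterleavedValues(DI, DeinterleavedValues, LocalDeadInsts))
  return false;
// ... legality checks simply 'return false' here, leaving DeadInsts untouched ...
DeadInsts.append(LocalDeadInsts.begin(), LocalDeadInsts.end());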


// TODO: Add support for using SVE instructions with fixed types later, using
// the code from lowerInterleavedLoad to obtain the correct container type.
if (UseScalable && !VTy->isScalableTy())
if (UseScalable && !VTy->isScalableTy()) {
DeadInsts.clear();
Collaborator

As above, you cannot assume DeadInsts is empty on function entry.

if (UseScalable)
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
// Replcae output of deinterleave2 intrinsic by output of ldN2/ldN4
Collaborator

s/Replcae/Replace/

@@ -17168,13 +17167,17 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
const DataLayout &DL = II->getModule()->getDataLayout();

bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable)) {
DeadInsts.clear();
Collaborator

As above, you cannot assume DeadInsts is empty on function entry.


// TODO: Add support for using SVE instructions with fixed types later, using
// the code from lowerInterleavedStore to obtain the correct container type.
if (UseScalable && !VTy->isScalableTy())
if (UseScalable && !VTy->isScalableTy()) {
DeadInsts.clear();
Collaborator

As above, you cannot assume DeadInsts is empty on function entry.

Comment on lines 17137 to 17138
DeadInsts.push_back(cast<Instruction>(II1));
DeadInsts.push_back(cast<Instruction>(II2));
Collaborator

Here II1 and II2 will only be populated when asserts are enabled, which means a release build will use uninitialised variables. Given you only get here when the main match statement passes you can safely omit the assert and use II->getOperand(#) directly.
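
i.e. something along these lines (sketch):

// The match above already guarantees II's operands are the two inner
// interleave2 calls, so they can be recorded directly.
DeadInsts.push_back(cast<Instruction>(II->getOperand(0)));
DeadInsts.push_back(cast<Instruction>(II->getOperand(1)));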

update DeadInst vec on successful transformation only

Change-Id: I920357d090960b84c46648666eca033033f209a3
Collaborator

@paulwalker-arm paulwalker-arm left a comment

A few minor comments but otherwise this looks good to me.

%deinterleave_half2 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %4)
%7 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half2, 0
%8 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half2, 1
ret void
Collaborator

The other tests are fine, but for wide_deinterleave4 it is worth adding the add/sub combination that you have for deinterleave4, because it better shows the extracted values end up where they're supposed to be.

// Only a factor of 2 supported at present.
const unsigned Factor = 2;
SmallVector<Instruction *, 4> DeinterleavedValues;
SmallVector<Instruction *, 4> DeInterleaveDeadInsts;
Collaborator

Is 8 a better size for DeInterleaveDeadInsts since that's the number of dead instructions expected for deinterleave4?

if (UseScalable)
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
// Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
for (unsigned I = 0; I < DeinterleavedValues.size(); I++) {
Collaborator

Can you use Factor here instead of DeinterleavedValues.size(), since it's already cached?

Change-Id: I3849bda381128c5b2a4b6e19a8f3ffa20744895b
@@ -3155,8 +3155,9 @@ class TargetLoweringBase {
///
/// \p DI is the deinterleave intrinsic.
/// \p LI is the accompanying load instruction
Contributor

The comments on these 2 functions look outdated. It would be nice to explain what DeadInsts should look like going into and out of the function given it's a reference to a vector.
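
For example, the contract could be documented along these lines (wording is only a suggestion):

/// \p DI is the deinterleave intrinsic.
/// \p LI is the accompanying load instruction.
/// \p DeadInsts is a vector of instructions that the caller will erase on
/// success; the target appends everything it made dead (uses before defs)
/// and must leave the vector unchanged if it returns false.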

Contributor

Thanks!

Change-Id: I47fa70b02ceae17eae5eb3bc9f7aea57fa5e4f50
@hassnaaHamdi hassnaaHamdi merged commit 3176f25 into llvm:main Aug 12, 2024
6 of 8 checks passed
bwendling pushed a commit to bwendling/llvm-project that referenced this pull request Aug 15, 2024
…#89276)

- [AArch64]: TargetLowering is updated to spot load/store (de)interleave4 like sequences using PatternMatch,
   and emit equivalent sve.ld4 and sve.st4 intrinsics.