[LV] Reuse VPReplicateRecipe to handle scalar stores in exit block. #106342
Conversation
@llvm/pr-subscribers-llvm-transforms

Author: Mel Chen (Mel-Chen)

Changes: This patch separates the computation of the final reduction result from the intermediate stores of the reduction.

Full diff: https://github.com/llvm/llvm-project/pull/106342.diff

4 Files Affected:
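For context, the kind of loop this patch targets is a reduction whose running value is also stored to a loop-invariant address on every iteration. A minimal C++ sketch of the pattern (illustrative names mirroring the shape of the reduc_* tests below, not code from the patch):

```cpp
// Sketch of the source pattern being vectorized: a sum reduction whose
// running value is stored to the loop-invariant address *dst on every
// iteration. After vectorization, only the final reduction result needs
// to be stored once, in the middle (exit) block; that single scalar store
// is what this patch now models with a uniform VPReplicateRecipe instead
// of emitting it inside ComputeReductionResult.
void reduce_with_invariant_store(int *dst, const int *src, int n) {
  int sum = 0;
  for (int i = 0; i < n; ++i) {
    sum += src[i];
    *dst = sum; // intermediate store of the reduction
  }
}
```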
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6850b502939f58..f855d5f2f8de25 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2388,8 +2388,8 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
AC->registerAssumption(II);
// End if-block.
- bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
- if (IfPredicateInstr)
+ const VPRegionBlock *Region = RepRecipe->getParent()->getParent();
+ if (Region && Region->isReplicator())
PredicatedInstructions.push_back(Cloned);
}
@@ -8901,6 +8901,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
return Legal->blockNeedsPredication(BB) || NeedsBlends;
});
+ auto *MiddleVPBB =
+ cast<VPBasicBlock>(Plan->getVectorLoopRegion()->getSingleSuccessor());
+ VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
// Relevant instructions from basic block BB will be grouped into VPRecipe
// ingredients and fill a new VPBasicBlock.
@@ -8931,8 +8934,17 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// with the final reduction value will be added to the exit block
StoreInst *SI;
if ((SI = dyn_cast<StoreInst>(&I)) &&
- Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
+ Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
+ // Only create recipe for the last intermediate store of the reduction.
+ if (!Legal->isInvariantStoreOfReduction(SI))
+ continue;
+ auto *Recipe = new VPReplicateRecipe(
+ SI, RecipeBuilder.mapToVPValues(Instr->operands()),
+ true /* IsUniform */);
+ RecipeBuilder.setRecipe(SI, Recipe);
+ Recipe->insertBefore(*MiddleVPBB, MBIP);
continue;
+ }
VPRecipeBase *Recipe =
RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
@@ -9130,51 +9142,13 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
using namespace VPlanPatternMatch;
VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
- // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores
- // sank outside of the loop would keep the same order as they had in the
- // original loop.
- SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
- for (VPRecipeBase &R : Header->phis()) {
- if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
- ReductionPHIList.emplace_back(ReductionPhi);
- }
- bool HasIntermediateStore = false;
- stable_sort(ReductionPHIList,
- [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
- const VPReductionPHIRecipe *R2) {
- auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
- auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
- HasIntermediateStore |= IS1 || IS2;
-
- // If neither of the recipes has an intermediate store, keep the
- // order the same.
- if (!IS1 && !IS2)
- return false;
-
- // If only one of the recipes has an intermediate store, then
- // move it towards the beginning of the list.
- if (IS1 && !IS2)
- return true;
-
- if (!IS1 && IS2)
- return false;
-
- // If both recipes have an intermediate store, then the recipe
- // with the later store should be processed earlier. So it
- // should go to the beginning of the list.
- return DT->dominates(IS2, IS1);
- });
-
- if (HasIntermediateStore && ReductionPHIList.size() > 1)
- for (VPRecipeBase *R : ReductionPHIList)
- R->moveBefore(*Header, Header->getFirstNonPhi());
-
for (VPRecipeBase &R : Header->phis()) {
auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
continue;
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
+ StoreInst* IntermediateStore = RdxDesc.IntermediateStore;
RecurKind Kind = RdxDesc.getRecurrenceKind();
assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
"AnyOf reductions are not allowed for in-loop reductions");
@@ -9187,9 +9161,13 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
for (VPUser *U : Cur->users()) {
auto *UserRecipe = cast<VPSingleDefRecipe>(U);
if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
- assert(match(U, m_Binary<VPInstruction::ExtractFromEnd>(
- m_VPValue(), m_VPValue())) &&
- "U must be an ExtractFromEnd VPInstruction");
+ assert((match(U, m_Binary<VPInstruction::ExtractFromEnd>(
+ m_VPValue(), m_VPValue())) ||
+ (isa<VPReplicateRecipe>(U) &&
+ cast<VPReplicateRecipe>(U)->getUnderlyingValue() ==
+ IntermediateStore)) &&
+ "U must be either an ExtractFromEnd VPInstruction or a "
+ "uniform store sourced from the intermediate store.");
continue;
}
Worklist.insert(UserRecipe);
@@ -9304,6 +9282,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
continue;
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
+ StoreInst *IntermediateStore = RdxDesc.IntermediateStore;
// Adjust AnyOf reductions; replace the reduction phi for the selected value
// with a boolean reduction phi node to check if the condition is true in
// any iteration. The final value is selected by the final
@@ -9406,11 +9385,14 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
auto *FinalReductionResult = new VPInstruction(
VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
FinalReductionResult->insertBefore(*MiddleVPBB, IP);
- OrigExitingVPV->replaceUsesWithIf(FinalReductionResult, [](VPUser &User,
- unsigned) {
- return match(&User, m_Binary<VPInstruction::ExtractFromEnd>(m_VPValue(),
- m_VPValue()));
- });
+ OrigExitingVPV->replaceUsesWithIf(
+ FinalReductionResult, [IntermediateStore](VPUser &User, unsigned) {
+ return match(&User, m_Binary<VPInstruction::ExtractFromEnd>(
+ m_VPValue(), m_VPValue())) ||
+ (isa<VPReplicateRecipe>(&User) &&
+ cast<VPReplicateRecipe>(&User)->getUnderlyingValue() ==
+ IntermediateStore);
+ });
}
VPlanTransforms::clearReductionWrapFlags(*Plan);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 53b28a692059f6..2689f9cf31117c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -601,14 +601,6 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
: Builder.CreateZExt(ReducedPartRdx, PhiTy);
}
- // If there were stores of the reduction value to a uniform memory address
- // inside the loop, create the final store here.
- if (StoreInst *SI = RdxDesc.IntermediateStore) {
- auto *NewSI = Builder.CreateAlignedStore(
- ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
- propagateMetadata(NewSI, SI);
- }
-
return ReducedPartRdx;
}
case VPInstruction::ExtractFromEnd: {
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
index 8cf4e77a0d4990..0d9918b74a2ff2 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
@@ -596,10 +596,10 @@ exit: ; preds = %for.body
define void @reduc_add_mul_store_same_ptr(ptr %dst, ptr readonly %src) {
; CHECK-LABEL: define void @reduc_add_mul_store_same_ptr
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
-; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]])
-; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4:%.*]])
+; CHECK-NEXT: store i32 [[TMP6]], ptr %dst, align 4
+; CHECK-NEXT: store i32 [[TMP7]], ptr %dst, align 4
;
entry:
br label %for.body
@@ -625,10 +625,10 @@ exit:
define void @reduc_mul_add_store_same_ptr(ptr %dst, ptr readonly %src) {
; CHECK-LABEL: define void @reduc_mul_add_store_same_ptr
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
-; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
-; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4:%.*]])
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
+; CHECK-NEXT: store i32 [[TMP7]], ptr %dst, align 4
+; CHECK-NEXT: store i32 [[TMP6]], ptr %dst, align 4
;
entry:
br label %for.body
@@ -655,10 +655,10 @@ exit:
define void @reduc_add_mul_store_different_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) {
; CHECK-LABEL: define void @reduc_add_mul_store_different_ptr
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
-; CHECK-NEXT: store i32 [[TMP4]], ptr %dst2, align 4
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]])
-; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4:%.*]])
+; CHECK-NEXT: store i32 [[TMP6]], ptr %dst1, align 4
+; CHECK-NEXT: store i32 [[TMP7]], ptr %dst2, align 4
;
entry:
br label %for.body
@@ -684,10 +684,10 @@ exit:
define void @reduc_mul_add_store_different_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) {
; CHECK-LABEL: define void @reduc_mul_add_store_different_ptr
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
-; CHECK-NEXT: store i32 [[TMP4]], ptr %dst2, align 4
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
-; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4:%.*]])
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
+; CHECK-NEXT: store i32 [[TMP7]], ptr %dst1, align 4
+; CHECK-NEXT: store i32 [[TMP6]], ptr %dst2, align 4
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index f18ed825a6b886..af74b18211f208 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -212,6 +212,7 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
; CHECK-NEXT: EMIT vp<[[RED_RES:.+]]> = compute-reduction-result ir<%red>, ir<%red.next>
+; CHECK-NEXT: CLONE store vp<[[RED_RES]]>, ir<%dst>
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VEC_TC]]>
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed from 3ce39e4 to 8d8635f.
ping
Thanks, this looks like a nice use of VPReplicateRecipe to simplify handling of reductions with a store to an invariant address.
Unless I am missing something, using VPReplicateRecipe here should be fine, but it would be good to add some extra assertions to guard against unintentional misuse.
for (VPRecipeBase &R : Header->phis()) {
  auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
  if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
    continue;

  const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
  StoreInst *IntermediateStore = RdxDesc.IntermediateStore;
Only used in an assertion; it will need to be sunk into the assert (preferable) or marked as potentially unused to avoid a warning.
Makes sense, thanks.
912cccf
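For readers following along: the warning in question is -Wunused-variable, which fires in NDEBUG builds, where assert() compiles away and the variable's only use disappears. A generic sketch of the two fixes suggested above (illustrative code, not from the patch):

```cpp
#include <cassert>

int computeIntermediate() { return 42; } // stand-in for a value only checked in an assert

void option1_sink_into_assert() {
  // Preferred: no named variable survives in release builds, because the
  // whole expression lives inside the assert.
  assert(computeIntermediate() == 42 && "unexpected intermediate value");
}

void option2_maybe_unused() {
  // Alternative: keep the variable, but tell the compiler that having no
  // use outside the assert is intentional.
  [[maybe_unused]] int Intermediate = computeIntermediate();
  assert(Intermediate == 42 && "unexpected intermediate value");
}

int main() {
  option1_sink_into_assert();
  option2_maybe_unused();
}
```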
@@ -9304,6 +9282,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
      continue;

    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
    StoreInst *IntermediateStore = RdxDesc.IntermediateStore;
Only used in an assertion; it will need to be sunk into the assert (preferable) or marked as potentially unused to avoid a warning.
It is not used in an assertion; it is used in OrigExitingVPV->replaceUsesWithIf.
Would you like it moved closer to where it is used?
Force-pushed from 8d8635f to 40b40fa.
Force-pushed from 40b40fa to 206be4f.
Ping
Force-pushed from 8df0b23 to c3b41e2.
@@ -8956,8 +8965,17 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   // with the final reduction value will be added to the exit block
Update comment, possibly merge with the new one below?
76d4dde
And replaced "last" with "final", aligning the term with the comment of isInvariantStoreOfReduction:
/// Returns True if given store is a final invariant store of one of the
/// reductions found in the loop.
bool isInvariantStoreOfReduction(StoreInst *SI);
assert((match(U, m_Binary<VPInstruction::ExtractFromEnd>(
            m_VPValue(), m_VPValue())) ||
        (isa<VPReplicateRecipe>(U) &&
         cast<VPReplicateRecipe>(U)->getUnderlyingValue() ==
             RdxDesc.IntermediateStore)) &&
       "U must be either an ExtractFromEnd VPInstruction or a "
       "uniform store sourced from the intermediate store.");
Probably time to remove the assert, or adjust it to allow any recipe in the middle block instead of checking for specific recipes?
Agree.
a3a6614
Force-pushed from c3b41e2 to a3a6614.
LGTM, thanks, with one remaining suggestion.
auto *Recipe = new VPReplicateRecipe(
    SI, RecipeBuilder.mapToVPValues(Instr->operands()),
    true /* IsUniform */);
RecipeBuilder.setRecipe(SI, Recipe);
Is this needed?
Thank you for your suggestion :)
It has been removed. After confirming, I think it is not needed, as there is no use case for it at the moment.
Refine the update lambda of ComputeReductionResult. Co-authored-by: Florian Hahn <[email protected]>
Force-pushed from a3a6614 to b494e9c.
[LV] Reuse VPReplicateRecipe to handle scalar stores in exit block (llvm#106342)

This patch separates the computation of the final reduction result from the intermediate stores of the reduction.

Co-authored-by: Florian Hahn <[email protected]>