Skip to content

Commit

Permalink
[LV] Reuse VPReplicateRecipe to handle scalar stores in exit block. (#…
Browse files Browse the repository at this point in the history
…106342)

This patch separates the computation of the final reduction result and
the intermediate stores of reduction.

---------

Co-authored-by: Florian Hahn <[email protected]>
  • Loading branch information
Mel-Chen and fhahn authored Sep 30, 2024
1 parent 147558e commit f8373cb
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 76 deletions.
83 changes: 31 additions & 52 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2346,6 +2346,12 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
// End if-block.
VPRegionBlock *Parent = RepRecipe->getParent()->getParent();
bool IfPredicateInstr = Parent ? Parent->isReplicator() : false;
assert((Parent || all_of(RepRecipe->operands(),
[](VPValue *Op) {
return Op->isDefinedOutsideLoopRegions();
})) &&
"Expected a recipe is either within a region or all of its operands "
"are defined outside the vectorized region.");
if (IfPredicateInstr)
PredicatedInstructions.push_back(Cloned);
}
Expand Down Expand Up @@ -8950,6 +8956,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
return Legal->blockNeedsPredication(BB) || NeedsBlends;
});
auto *MiddleVPBB =
cast<VPBasicBlock>(Plan->getVectorLoopRegion()->getSingleSuccessor());
VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
// Relevant instructions from basic block BB will be grouped into VPRecipe
// ingredients and fill a new VPBasicBlock.
Expand All @@ -8976,12 +8985,21 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
Operands = {OpRange.begin(), OpRange.end()};
}

// Invariant stores inside loop will be deleted and a single store
// with the final reduction value will be added to the exit block
// The stores with invariant address inside the loop will be deleted, and
// in the exit block, a uniform store recipe will be created for the final
// invariant store of the reduction.
StoreInst *SI;
if ((SI = dyn_cast<StoreInst>(&I)) &&
Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
// Only create recipe for the final invariant store of the reduction.
if (!Legal->isInvariantStoreOfReduction(SI))
continue;
auto *Recipe = new VPReplicateRecipe(
SI, RecipeBuilder.mapToVPValues(Instr->operands()),
true /* IsUniform */);
Recipe->insertBefore(*MiddleVPBB, MBIP);
continue;
}

VPRecipeBase *Recipe =
RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
Expand Down Expand Up @@ -9150,45 +9168,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
using namespace VPlanPatternMatch;
VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
// Gather all VPReductionPHIRecipe and sort them so that Intermediate stores
// sank outside of the loop would keep the same order as they had in the
// original loop.
SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
for (VPRecipeBase &R : Header->phis()) {
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
ReductionPHIList.emplace_back(ReductionPhi);
}
bool HasIntermediateStore = false;
stable_sort(ReductionPHIList,
[this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
const VPReductionPHIRecipe *R2) {
auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
HasIntermediateStore |= IS1 || IS2;

// If neither of the recipes has an intermediate store, keep the
// order the same.
if (!IS1 && !IS2)
return false;

// If only one of the recipes has an intermediate store, then
// move it towards the beginning of the list.
if (IS1 && !IS2)
return true;

if (!IS1 && IS2)
return false;

// If both recipes have an intermediate store, then the recipe
// with the later store should be processed earlier. So it
// should go to the beginning of the list.
return DT->dominates(IS2, IS1);
});

if (HasIntermediateStore && ReductionPHIList.size() > 1)
for (VPRecipeBase *R : ReductionPHIList)
R->moveBefore(*Header, Header->getFirstNonPhi());

VPBasicBlock *MiddleVPBB =
cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
for (VPRecipeBase &R : Header->phis()) {
auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
Expand All @@ -9207,9 +9188,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
for (VPUser *U : Cur->users()) {
auto *UserRecipe = cast<VPSingleDefRecipe>(U);
if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
assert(match(U, m_Binary<VPInstruction::ExtractFromEnd>(
m_VPValue(), m_VPValue())) &&
"U must be an ExtractFromEnd VPInstruction");
assert(UserRecipe->getParent() == MiddleVPBB &&
"U must be either in the loop region or the middle block.");
continue;
}
Worklist.insert(UserRecipe);
Expand Down Expand Up @@ -9314,8 +9294,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
}
VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
Builder.setInsertPoint(&*LatchVPBB->begin());
VPBasicBlock *MiddleVPBB =
cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
for (VPRecipeBase &R :
Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
Expand Down Expand Up @@ -9390,12 +9368,13 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
// also modeled in VPlan.
auto *FinalReductionResult = new VPInstruction(
VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
// Update all users outside the vector region.
OrigExitingVPV->replaceUsesWithIf(
FinalReductionResult, [](VPUser &User, unsigned) {
auto *Parent = cast<VPRecipeBase>(&User)->getParent();
return Parent && !Parent->getParent();
});
FinalReductionResult->insertBefore(*MiddleVPBB, IP);
OrigExitingVPV->replaceUsesWithIf(FinalReductionResult, [](VPUser &User,
unsigned) {
return match(&User, m_Binary<VPInstruction::ExtractFromEnd>(m_VPValue(),
m_VPValue()));
});

// Adjust AnyOf reductions; replace the reduction phi for the selected value
// with a boolean reduction phi node to check if the condition is true in
Expand Down
8 changes: 0 additions & 8 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -611,14 +611,6 @@ Value *VPInstruction::generate(VPTransformState &State) {
: Builder.CreateZExt(ReducedPartRdx, PhiTy);
}

// If there were stores of the reduction value to a uniform memory address
// inside the loop, create the final store here.
if (StoreInst *SI = RdxDesc.IntermediateStore) {
auto *NewSI = Builder.CreateAlignedStore(
ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
propagateMetadata(NewSI, SI);
}

return ReducedPartRdx;
}
case VPInstruction::ExtractFromEnd: {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -596,10 +596,10 @@ exit: ; preds = %for.body
define void @reduc_add_mul_store_same_ptr(ptr %dst, ptr readonly %src) {
; CHECK-LABEL: define void @reduc_add_mul_store_same_ptr
; CHECK: middle.block:
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]])
; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4:%.*]])
; CHECK-NEXT: store i32 [[TMP6]], ptr %dst, align 4
; CHECK-NEXT: store i32 [[TMP7]], ptr %dst, align 4
;
entry:
br label %for.body
Expand All @@ -625,10 +625,10 @@ exit:
define void @reduc_mul_add_store_same_ptr(ptr %dst, ptr readonly %src) {
; CHECK-LABEL: define void @reduc_mul_add_store_same_ptr
; CHECK: middle.block:
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4:%.*]])
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
; CHECK-NEXT: store i32 [[TMP7]], ptr %dst, align 4
; CHECK-NEXT: store i32 [[TMP6]], ptr %dst, align 4
;
entry:
br label %for.body
Expand All @@ -655,10 +655,10 @@ exit:
define void @reduc_add_mul_store_different_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) {
; CHECK-LABEL: define void @reduc_add_mul_store_different_ptr
; CHECK: middle.block:
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
; CHECK-NEXT: store i32 [[TMP4]], ptr %dst2, align 4
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]])
; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4:%.*]])
; CHECK-NEXT: store i32 [[TMP6]], ptr %dst1, align 4
; CHECK-NEXT: store i32 [[TMP7]], ptr %dst2, align 4
;
entry:
br label %for.body
Expand All @@ -684,10 +684,10 @@ exit:
define void @reduc_mul_add_store_different_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) {
; CHECK-LABEL: define void @reduc_mul_add_store_different_ptr
; CHECK: middle.block:
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
; CHECK-NEXT: store i32 [[TMP4]], ptr %dst2, align 4
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4:%.*]])
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
; CHECK-NEXT: store i32 [[TMP7]], ptr %dst1, align 4
; CHECK-NEXT: store i32 [[TMP6]], ptr %dst2, align 4
;
entry:
br label %for.body
Expand Down
1 change: 1 addition & 0 deletions llvm/test/Transforms/LoopVectorize/vplan-printing.ll
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,7 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
; CHECK-NEXT: EMIT vp<[[RED_RES:.+]]> = compute-reduction-result ir<%red>, ir<%red.next>
; CHECK-NEXT: CLONE store vp<[[RED_RES]]>, ir<%dst>
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
Expand Down

0 comments on commit f8373cb

Please sign in to comment.