[LV][VPlan] set FastMathFlags on EVLRecipe #119847
base: main
Conversation
@llvm/pr-subscribers-vectorizers

Author: LiqinWeng (LiqinWeng)

Changes

Patch is 97.67 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/119847.diff

10 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 8794517b777f3b..26722e8519e9e0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1661,8 +1661,8 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags {
VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID,
ArrayRef<VPValue *> CallArguments, Type *Ty,
- DebugLoc DL = {})
- : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments),
+ FastMathFlags FMFs, DebugLoc DL = {})
+ : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, FMFs),
VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) {
LLVMContext &Ctx = Ty->getContext();
AttributeList Attrs = Intrinsic::getAttributes(Ctx, VectorIntrinsicID);
@@ -1676,9 +1676,10 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags {
VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID,
std::initializer_list<VPValue *> CallArguments,
- Type *Ty, DebugLoc DL = {})
+ Type *Ty, FastMathFlags FMFs, DebugLoc DL = {})
: VPWidenIntrinsicRecipe(VectorIntrinsicID,
- ArrayRef<VPValue *>(CallArguments), Ty, DL) {}
+ ArrayRef<VPValue *>(CallArguments), Ty, FMFs,
+ DL) {}
~VPWidenIntrinsicRecipe() override = default;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index e882368544e815..d92392bd4460e3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -976,8 +976,11 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
CI->getOperandBundlesAsDefs(OpBundles);
CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
-
- setFlags(V);
+ // vector-predication intrinsics only accept FMF flags, while vector intrinsic
+ // can support all flags.
+ bool VPIntrinsic = VPIntrinsic::isVPIntrinsic(VectorIntrinsicID);
+ if ((VPIntrinsic && isa<FPMathOperator>(V)) || !VPIntrinsic)
+ setFlags(V);
if (!V->getType()->isVoidTy())
State.set(this, V);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 922cba7831f4e9..fb6457028163c0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1516,16 +1516,23 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
IntegerType::getInt1Ty(CI->getContext())));
Ops.push_back(Mask);
Ops.push_back(&EVL);
+ FastMathFlags FMF = {};
+ if (isa<FPMathOperator>(CI))
+ FMF = CI->getFastMathFlags();
return new VPWidenIntrinsicRecipe(
- VPID, Ops, TypeInfo.inferScalarType(CInst),
+ VPID, Ops, TypeInfo.inferScalarType(CInst), FMF,
CInst->getDebugLoc());
})
.Case<VPWidenSelectRecipe>([&](VPWidenSelectRecipe *Sel) {
+ auto *SelInst = dyn_cast<SelectInst>(Sel->getUnderlyingInstr());
SmallVector<VPValue *> Ops(Sel->operands());
Ops.push_back(&EVL);
+ FastMathFlags FMF = {};
+ if (isa<FPMathOperator>(SelInst))
+ FMF = SelInst->getFastMathFlags();
return new VPWidenIntrinsicRecipe(Intrinsic::vp_select, Ops,
TypeInfo.inferScalarType(Sel),
- Sel->getDebugLoc());
+ FMF, Sel->getDebugLoc());
})
.Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
VPValue *LHS, *RHS;
@@ -1540,9 +1547,12 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
// limited to selects whose condition is a header mask.
VPValue *AllTrue =
Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx));
+ FastMathFlags FMF = {};
+ if (VPI->hasFastMathFlags())
+ FMF = VPI->getFastMathFlags();
return new VPWidenIntrinsicRecipe(
Intrinsic::vp_merge, {AllTrue, LHS, RHS, &EVL},
- TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
+ TypeInfo.inferScalarType(LHS), FMF, VPI->getDebugLoc());
})
.Default([&](VPRecipeBase *R) { return nullptr; });
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index 14818199072c28..3f39d7628aaae5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -538,3 +538,222 @@ for.body:
for.end:
ret i32 %smin
}
+
+define nofpclass(nan inf) float @vp_reduction_with_fastflags(ptr %a, ptr %b, i64 %N, float %start) {
+; OUTLOOP-LABEL: @vp_reduction_with_fastflags(
+; OUTLOOP-NEXT: entry:
+; OUTLOOP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; OUTLOOP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; OUTLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; OUTLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; OUTLOOP: vector.ph:
+; OUTLOOP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; OUTLOOP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; OUTLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; OUTLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; OUTLOOP-NEXT: [[TMP6:%.*]] = insertelement <vscale x 4 x float> zeroinitializer, float [[START:%.*]], i32 0
+; OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; OUTLOOP: vector.body:
+; OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; OUTLOOP-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; OUTLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP7]]
+; OUTLOOP-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
+; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP9]], align 4
+; OUTLOOP-NEXT: [[TMP10]] = fadd fast <vscale x 4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
+; OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; OUTLOOP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; OUTLOOP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; OUTLOOP: middle.block:
+; OUTLOOP-NEXT: [[TMP12:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> [[TMP10]])
+; OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; OUTLOOP: scalar.ph:
+; OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; OUTLOOP: for.body:
+; OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; OUTLOOP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; OUTLOOP-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[IV]]
+; OUTLOOP-NEXT: [[TMP13:%.*]] = load float, ptr [[GEP]], align 4
+; OUTLOOP-NEXT: [[ADD]] = fadd fast float [[TMP13]], [[RDX]]
+; OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; OUTLOOP: for.end:
+; OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; OUTLOOP-NEXT: ret float [[ADD_LCSSA]]
+;
+; INLOOP-LABEL: @vp_reduction_with_fastflags(
+; INLOOP-NEXT: entry:
+; INLOOP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; INLOOP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; INLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; INLOOP: vector.ph:
+; INLOOP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; INLOOP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; INLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; INLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; INLOOP: vector.body:
+; INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; INLOOP-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; INLOOP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; INLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP6]]
+; INLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; INLOOP-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> [[WIDE_LOAD]])
+; INLOOP-NEXT: [[TMP10]] = fadd fast float [[TMP9]], [[VEC_PHI]]
+; INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; INLOOP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INLOOP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; INLOOP: middle.block:
+; INLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; INLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; INLOOP: scalar.ph:
+; INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; INLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; INLOOP: for.body:
+; INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; INLOOP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; INLOOP-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[IV]]
+; INLOOP-NEXT: [[TMP12:%.*]] = load float, ptr [[GEP]], align 4
+; INLOOP-NEXT: [[ADD]] = fadd fast float [[TMP12]], [[RDX]]
+; INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; INLOOP: for.end:
+; INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
+; INLOOP-NEXT: ret float [[ADD_LCSSA]]
+;
+; IF-EVL-OUTLOOP-LABEL: @vp_reduction_with_fastflags(
+; IF-EVL-OUTLOOP-NEXT: entry:
+; IF-EVL-OUTLOOP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-OUTLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-OUTLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-OUTLOOP: vector.ph:
+; IF-EVL-OUTLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-OUTLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-OUTLOOP-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = insertelement <vscale x 4 x float> zeroinitializer, float [[START:%.*]], i32 0
+; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL-OUTLOOP: vector.body:
+; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP11]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
+; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-OUTLOOP-NEXT: [[VP_OP:%.*]] = call fast <vscale x 4 x float> @llvm.vp.fadd.nxv4f32(<vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x float> [[VEC_PHI]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-OUTLOOP-NEXT: [[TMP14]] = call fast <vscale x 4 x float> @llvm.vp.merge.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[VP_OP]], <vscale x 4 x float> [[VEC_PHI]], i32 [[TMP10]])
+; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64
+; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
+; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL-OUTLOOP: middle.block:
+; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> [[TMP14]])
+; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL-OUTLOOP: scalar.ph:
+; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL-OUTLOOP: for.body:
+; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[IV]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = load float, ptr [[GEP]], align 4
+; IF-EVL-OUTLOOP-NEXT: [[ADD]] = fadd fast float [[TMP18]], [[RDX]]
+; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; IF-EVL-OUTLOOP: for.end:
+; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-OUTLOOP-NEXT: ret float [[ADD_LCSSA]]
+;
+; IF-EVL-INLOOP-LABEL: @vp_reduction_with_fastflags(
+; IF-EVL-INLOOP-NEXT: entry:
+; IF-EVL-INLOOP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-INLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-INLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-INLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-INLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-INLOOP: vector.ph:
+; IF-EVL-INLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-INLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-INLOOP-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-INLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-INLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL-INLOOP: vector.body:
+; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP10]]
+; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
+; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = call fast float @llvm.vp.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-INLOOP-NEXT: [[TMP14]] = fadd fast float [[TMP13]], [[VEC_PHI]]
+; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = zext i32 [[TMP9]] to i64
+; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
+; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; IF-EVL-INLOOP-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-INLOOP-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL-INLOOP: middle.block:
+; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL-INLOOP: scalar.ph:
+; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL-INLOOP: for.body:
+; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[IV]]
+; IF-EVL-INLOOP-NEXT: [[TMP17:%.*]] = load float, ptr [[GEP]], align 4
+; IF-EVL-INLOOP-NEXT: [[ADD]] = fadd fast float [[TMP17]], [[RDX]]
+; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+;...
[truncated]
@llvm/pr-subscribers-llvm-transforms

Author: LiqinWeng (LiqinWeng)

Changes

Patch is 97.67 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/119847.diff
@@ -1516,16 +1516,23 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
                  IntegerType::getInt1Ty(CI->getContext())));
              Ops.push_back(Mask);
              Ops.push_back(&EVL);
              FastMathFlags FMF = {};
              if (isa<FPMathOperator>(CI))
                FMF = CI->getFastMathFlags();
FastMathFlags on cast instructions seems to be a new thing? #115894
In that case, should we preserve them in VPWidenCastRecipe/VPScalarCastRecipe first? I can't find anything there that propagates them.
// vector-predication intrinsics only accept FMF flags, while vector intrinsic
// can support all flags.
bool VPIntrinsic = VPIntrinsic::isVPIntrinsic(VectorIntrinsicID);
if ((VPIntrinsic && isa<FPMathOperator>(V)) || !VPIntrinsic)
if (!VPIntrinsic || isa<FPMathOperator>(V))
is better
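
Spelled out as a minimal sketch against the hunk above (same names as in the patch; the condition is only reordered and is logically equivalent to the original):

// Non-VP intrinsics can carry all IR flags; VP intrinsics only take flags
// when the widened call is a floating-point operation.
bool VPIntrinsic = VPIntrinsic::isVPIntrinsic(VectorIntrinsicID);
if (!VPIntrinsic || isa<FPMathOperator>(V))
  setFlags(V);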
I'm wondering whether we need this check. Is it possible to use the existing constructor to pass in an OpType other than FPMathOp when constructing a VP intrinsic?
- VP-intrinsic and non-VP-intrinsic flags may not be completely consistent, e.g. uitofp has the nneg IR flag, but llvm.vp.uitofp does not seem to have it.
- Do the current VP intrinsics support other flags? Not yet, so can we support fastmath flags first?
; IF-EVL-INLOOP-NEXT:  }
;
; NO-VP-OUTLOOP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { |
Do we really need the NO-VP-* check lines in this file?
removed~
              CInst->getDebugLoc());
        })
        .Case<VPWidenSelectRecipe>([&](VPWidenSelectRecipe *Sel) {
auto *SelInst = dyn_cast<SelectInst>(Sel->getUnderlyingInstr()); |
Please only use flags from the recipe, not the underlying instruction. Using flags from the underlying instruction may be incorrect in some cases, e.g. if the original instruction only executed conditionally but the recipe now executes unconditionally. Same for VPWidenCastRecipe above.
Maybe start with just Select VPInstructions here, then PRs to add flags to VPWidenCastRecipe/VPWidenSelectRecipe separately, then a PR to pass them through to the intrinsics.
Thank you very much for your patience. Once #119510 is processed properly, I will correct it.
Sorry, I don't quite understand "Maybe start with just Select VPInstructions here". VPWidenSelectRecipe inherits from VPSingleDefRecipe, so it seems the only way to get the flags is from the underlying instruction. Or we could change VPWidenSelectRecipe to inherit from VPRecipeWithIRFlags instead; do you think that is OK?
Or change VPWidenSelectRecipe inheritance to VPRecipeWithIRFlags, do you think it is ok?
+1
Please review: #121023
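
For reference, a rough sketch of the direction discussed above. This is not the actual #121023 diff; the constructor shape and the flag accessors are assumptions modelled on how VPRecipeWithIRFlags is already used in this patch:

// Hypothetical: derive the select recipe from VPRecipeWithIRFlags so the
// fast-math flags live on the recipe itself rather than being read from the
// underlying SelectInst.
struct VPWidenSelectRecipe : public VPRecipeWithIRFlags {
  template <typename IterT>
  VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands)
      : VPRecipeWithIRFlags(VPDef::VPWidenSelectSC, Operands, I) {}
  // ... classof, execute, printing as before ...
};

// The EVL transform could then mirror the VPInstruction case above and take
// the flags from the recipe instead of from the underlying instruction:
FastMathFlags FMF = {};
if (Sel->hasFastMathFlags())
  FMF = Sel->getFastMathFlags();
return new VPWidenIntrinsicRecipe(Intrinsic::vp_select, Ops,
                                  TypeInfo.inferScalarType(Sel), FMF,
                                  Sel->getDebugLoc());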
… than underlying instruction. This patch may cause the flags in the CallRecipe to be lost after EVL transformation, and it has been addressed in the patch: llvm#119847
✅ With the latest revision this PR passed the C/C++ code formatter.
Change the inheritance of class VPWidenSelectRecipe to class VPRecipeWithIRFlags, which allows the select recipe to pass the fastmath flags. The patch llvm#119847 will add the fastmath flags to the recipe.
Currently supported:
1. vp.fpext/vp.fptrunc of VPWidenCastRecipe
2. vp.select of VPWidenSelectRecipe
3. vp.select of VPInstruction