[LV][VPlan] set FastMathFlags on EVLRecipe #119847
base: main
Conversation
@llvm/pr-subscribers-vectorizers

Author: LiqinWeng (LiqinWeng)

Changes

Patch is 97.67 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/119847.diff

10 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 8794517b777f3b..26722e8519e9e0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1661,8 +1661,8 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags {
VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID,
ArrayRef<VPValue *> CallArguments, Type *Ty,
- DebugLoc DL = {})
- : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments),
+ FastMathFlags FMFs, DebugLoc DL = {})
+ : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, FMFs),
VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) {
LLVMContext &Ctx = Ty->getContext();
AttributeList Attrs = Intrinsic::getAttributes(Ctx, VectorIntrinsicID);
@@ -1676,9 +1676,10 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags {
VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID,
std::initializer_list<VPValue *> CallArguments,
- Type *Ty, DebugLoc DL = {})
+ Type *Ty, FastMathFlags FMFs, DebugLoc DL = {})
: VPWidenIntrinsicRecipe(VectorIntrinsicID,
- ArrayRef<VPValue *>(CallArguments), Ty, DL) {}
+ ArrayRef<VPValue *>(CallArguments), Ty, FMFs,
+ DL) {}
~VPWidenIntrinsicRecipe() override = default;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index e882368544e815..d92392bd4460e3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -976,8 +976,11 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
CI->getOperandBundlesAsDefs(OpBundles);
CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
-
- setFlags(V);
+ // vector-predication intrinsics only accept FMF flags, while vector intrinsic
+ // can support all flags.
+ bool VPIntrinsic = VPIntrinsic::isVPIntrinsic(VectorIntrinsicID);
+ if ((VPIntrinsic && isa<FPMathOperator>(V)) || !VPIntrinsic)
+ setFlags(V);
if (!V->getType()->isVoidTy())
State.set(this, V);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 922cba7831f4e9..fb6457028163c0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1516,16 +1516,23 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
IntegerType::getInt1Ty(CI->getContext())));
Ops.push_back(Mask);
Ops.push_back(&EVL);
+ FastMathFlags FMF = {};
+ if (isa<FPMathOperator>(CI))
+ FMF = CI->getFastMathFlags();
return new VPWidenIntrinsicRecipe(
- VPID, Ops, TypeInfo.inferScalarType(CInst),
+ VPID, Ops, TypeInfo.inferScalarType(CInst), FMF,
CInst->getDebugLoc());
})
.Case<VPWidenSelectRecipe>([&](VPWidenSelectRecipe *Sel) {
+ auto *SelInst = dyn_cast<SelectInst>(Sel->getUnderlyingInstr());
SmallVector<VPValue *> Ops(Sel->operands());
Ops.push_back(&EVL);
+ FastMathFlags FMF = {};
+ if (isa<FPMathOperator>(SelInst))
+ FMF = SelInst->getFastMathFlags();
return new VPWidenIntrinsicRecipe(Intrinsic::vp_select, Ops,
TypeInfo.inferScalarType(Sel),
- Sel->getDebugLoc());
+ FMF, Sel->getDebugLoc());
})
.Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
VPValue *LHS, *RHS;
@@ -1540,9 +1547,12 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
// limited to selects whose condition is a header mask.
VPValue *AllTrue =
Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx));
+ FastMathFlags FMF = {};
+ if (VPI->hasFastMathFlags())
+ FMF = VPI->getFastMathFlags();
return new VPWidenIntrinsicRecipe(
Intrinsic::vp_merge, {AllTrue, LHS, RHS, &EVL},
- TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
+ TypeInfo.inferScalarType(LHS), FMF, VPI->getDebugLoc());
})
.Default([&](VPRecipeBase *R) { return nullptr; });
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index 14818199072c28..3f39d7628aaae5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -538,3 +538,222 @@ for.body:
for.end:
ret i32 %smin
}
+
+define nofpclass(nan inf) float @vp_reduction_with_fastflags(ptr %a, ptr %b, i64 %N, float %start) {
+; OUTLOOP-LABEL: @vp_reduction_with_fastflags(
+; OUTLOOP-NEXT: entry:
+; OUTLOOP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; OUTLOOP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; OUTLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; OUTLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; OUTLOOP: vector.ph:
+; OUTLOOP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; OUTLOOP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; OUTLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; OUTLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; OUTLOOP-NEXT: [[TMP6:%.*]] = insertelement <vscale x 4 x float> zeroinitializer, float [[START:%.*]], i32 0
+; OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; OUTLOOP: vector.body:
+; OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; OUTLOOP-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; OUTLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP7]]
+; OUTLOOP-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
+; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP9]], align 4
+; OUTLOOP-NEXT: [[TMP10]] = fadd fast <vscale x 4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
+; OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; OUTLOOP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; OUTLOOP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; OUTLOOP: middle.block:
+; OUTLOOP-NEXT: [[TMP12:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> [[TMP10]])
+; OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; OUTLOOP: scalar.ph:
+; OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; OUTLOOP: for.body:
+; OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; OUTLOOP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; OUTLOOP-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[IV]]
+; OUTLOOP-NEXT: [[TMP13:%.*]] = load float, ptr [[GEP]], align 4
+; OUTLOOP-NEXT: [[ADD]] = fadd fast float [[TMP13]], [[RDX]]
+; OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; OUTLOOP: for.end:
+; OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; OUTLOOP-NEXT: ret float [[ADD_LCSSA]]
+;
+; INLOOP-LABEL: @vp_reduction_with_fastflags(
+; INLOOP-NEXT: entry:
+; INLOOP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; INLOOP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; INLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; INLOOP: vector.ph:
+; INLOOP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; INLOOP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; INLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; INLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; INLOOP: vector.body:
+; INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; INLOOP-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; INLOOP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; INLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP6]]
+; INLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; INLOOP-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> [[WIDE_LOAD]])
+; INLOOP-NEXT: [[TMP10]] = fadd fast float [[TMP9]], [[VEC_PHI]]
+; INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; INLOOP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INLOOP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; INLOOP: middle.block:
+; INLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; INLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; INLOOP: scalar.ph:
+; INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; INLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; INLOOP: for.body:
+; INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; INLOOP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; INLOOP-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[IV]]
+; INLOOP-NEXT: [[TMP12:%.*]] = load float, ptr [[GEP]], align 4
+; INLOOP-NEXT: [[ADD]] = fadd fast float [[TMP12]], [[RDX]]
+; INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; INLOOP: for.end:
+; INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
+; INLOOP-NEXT: ret float [[ADD_LCSSA]]
+;
+; IF-EVL-OUTLOOP-LABEL: @vp_reduction_with_fastflags(
+; IF-EVL-OUTLOOP-NEXT: entry:
+; IF-EVL-OUTLOOP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-OUTLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-OUTLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-OUTLOOP: vector.ph:
+; IF-EVL-OUTLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-OUTLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-OUTLOOP-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = insertelement <vscale x 4 x float> zeroinitializer, float [[START:%.*]], i32 0
+; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL-OUTLOOP: vector.body:
+; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP11]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
+; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-OUTLOOP-NEXT: [[VP_OP:%.*]] = call fast <vscale x 4 x float> @llvm.vp.fadd.nxv4f32(<vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x float> [[VEC_PHI]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-OUTLOOP-NEXT: [[TMP14]] = call fast <vscale x 4 x float> @llvm.vp.merge.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[VP_OP]], <vscale x 4 x float> [[VEC_PHI]], i32 [[TMP10]])
+; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64
+; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
+; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL-OUTLOOP: middle.block:
+; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> [[TMP14]])
+; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL-OUTLOOP: scalar.ph:
+; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL-OUTLOOP: for.body:
+; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[IV]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = load float, ptr [[GEP]], align 4
+; IF-EVL-OUTLOOP-NEXT: [[ADD]] = fadd fast float [[TMP18]], [[RDX]]
+; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; IF-EVL-OUTLOOP: for.end:
+; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-OUTLOOP-NEXT: ret float [[ADD_LCSSA]]
+;
+; IF-EVL-INLOOP-LABEL: @vp_reduction_with_fastflags(
+; IF-EVL-INLOOP-NEXT: entry:
+; IF-EVL-INLOOP-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-INLOOP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-INLOOP-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-INLOOP-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-INLOOP-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-INLOOP: vector.ph:
+; IF-EVL-INLOOP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-INLOOP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-INLOOP-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-INLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-INLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL-INLOOP: vector.body:
+; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP10]]
+; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
+; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = call fast float @llvm.vp.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-INLOOP-NEXT: [[TMP14]] = fadd fast float [[TMP13]], [[VEC_PHI]]
+; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = zext i32 [[TMP9]] to i64
+; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
+; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; IF-EVL-INLOOP-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-INLOOP-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL-INLOOP: middle.block:
+; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL-INLOOP: scalar.ph:
+; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL-INLOOP: for.body:
+; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-INLOOP-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[IV]]
+; IF-EVL-INLOOP-NEXT: [[TMP17:%.*]] = load float, ptr [[GEP]], align 4
+; IF-EVL-INLOOP-NEXT: [[ADD]] = fadd fast float [[TMP17]], [[RDX]]
+; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+;...
[truncated]
@llvm/pr-subscribers-llvm-transforms

Author: LiqinWeng (LiqinWeng)

Changes

Patch is 97.67 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/119847.diff
@@ -1516,16 +1516,23 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
                  IntegerType::getInt1Ty(CI->getContext())));
              Ops.push_back(Mask);
              Ops.push_back(&EVL);
              FastMathFlags FMF = {};
              if (isa<FPMathOperator>(CI))
                FMF = CI->getFastMathFlags();
FastMathFlags on cast instructions seems to be a new thing? #115894
In that case, should we preserve them in VPWidenCastRecipe/VPScalarCastRecipe first? I can't find anything there that propagates them.
// vector-predication intrinsics only accept FMF flags, while vector intrinsic
// can support all flags.
bool VPIntrinsic = VPIntrinsic::isVPIntrinsic(VectorIntrinsicID);
if ((VPIntrinsic && isa<FPMathOperator>(V)) || !VPIntrinsic)
if (!VPIntrinsic || isa<FPMathOperator>(V))
is better
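
Spelled out as a minimal sketch against the hunk above (same names as in the patch; the condition is only reordered and is logically equivalent to the original):

// Non-VP intrinsics can carry all IR flags; VP intrinsics only take flags
// when the widened call is a floating-point operation.
bool VPIntrinsic = VPIntrinsic::isVPIntrinsic(VectorIntrinsicID);
if (!VPIntrinsic || isa<FPMathOperator>(V))
  setFlags(V);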
I'm wondering whether we need this check. Is it possible to use the existing constructor to pass in an OpType other than FPMathOp when constructing a VP intrinsic?
- VP-intrinsic and non-VP-intrinsic flags may not be completely consistent, e.g. uitofp has the nneg IR flag, but llvm.vp.uitofp does not seem to have it.
- Do the current VP intrinsics support other flags? Not yet, so can we support fastmath flags first?
; IF-EVL-INLOOP-NEXT:  }
;
; NO-VP-OUTLOOP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { |
Do we really need the NO-VP-* check lines in this file?
removed~
              CInst->getDebugLoc());
        })
        .Case<VPWidenSelectRecipe>([&](VPWidenSelectRecipe *Sel) {
auto *SelInst = dyn_cast<SelectInst>(Sel->getUnderlyingInstr()); |
Please only use flags from the recipe, not the underlying instruction. Using flags from the underlying instruction may be incorrect in some cases, e.g. if the original instruction only executed conditionally but the recipe now executes unconditionally. Same for VPWidenCastRecipe above.
Maybe start with just Select VPInstructions here, then PRs to add flags to VPWidenCastRecipe/VPWidenSelectRecipe separately, then a PR to pass them through to the intrinsics.
Thank you very much for your patience. Once #119510 is processed properly, I will correct it.
Sorry, I don't quite understand "Maybe start with just Select VPInstructions here". VPWidenSelectRecipe inherits from VPSingleDefRecipe, so it seems the only way to get the flags is from the underlying instruction. Or we could change VPWidenSelectRecipe to inherit from VPRecipeWithIRFlags instead; do you think that is OK?
Or change VPWidenSelectRecipe inheritance to VPRecipeWithIRFlags, do you think it is ok?
+1
Please review: #121023
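
For reference, a rough sketch of the direction discussed above. This is not the actual #121023 diff; the constructor shape and the flag accessors are assumptions modelled on how VPRecipeWithIRFlags is already used in this patch:

// Hypothetical: derive the select recipe from VPRecipeWithIRFlags so the
// fast-math flags live on the recipe itself rather than being read from the
// underlying SelectInst.
struct VPWidenSelectRecipe : public VPRecipeWithIRFlags {
  template <typename IterT>
  VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands)
      : VPRecipeWithIRFlags(VPDef::VPWidenSelectSC, Operands, I) {}
  // ... classof, execute, printing as before ...
};

// The EVL transform could then mirror the VPInstruction case above and take
// the flags from the recipe instead of from the underlying instruction:
FastMathFlags FMF = {};
if (Sel->hasFastMathFlags())
  FMF = Sel->getFastMathFlags();
return new VPWidenIntrinsicRecipe(Intrinsic::vp_select, Ops,
                                  TypeInfo.inferScalarType(Sel), FMF,
                                  Sel->getDebugLoc());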
… than underlying instruction. This patch may cause the flags in the CallRecipe to be lost after EVL transformation, and it has been addressed in the patch: llvm#119847
✅ With the latest revision this PR passed the C/C++ code formatter.
Change the inheritance of class VPWidenSelectRecipe to class VPRecipeWithIRFlags, which allows the select recipe to pass the fastmath flags. The patch llvm#119847 will add the fastmath flags to the recipe.
Currently supported:
1. vp.fpext/vp.fptrunc of VPWidenCastRecipe
2. vp.select of VPWidenSelectRecipe
3. vp.select of VPInstruction