Skip to content

Commit

Permalink
[flang][OpenMP] Extend delayed privatization for omp.simd (llvm#122156)
Browse files Browse the repository at this point in the history

Adds support for delayed privatization for `simd` directives. This PR
covers the whole pipeline, from PFT lowering down to LLVM IR translation.
  • Loading branch information
ergawy authored and Mel-Chen committed Jan 13, 2025
1 parent c6c4a87 commit d92926b
Show file tree
Hide file tree
Showing 9 changed files with 182 additions and 62 deletions.
8 changes: 4 additions & 4 deletions flang/lib/Lower/OpenMP/OpenMP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2144,19 +2144,19 @@ static void genStandaloneSimd(lower::AbstractConverter &converter,
genSimdClauses(converter, semaCtx, item->clauses, loc, simdClauseOps,
simdReductionSyms);

// TODO: Support delayed privatization.
DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval,
/*shouldCollectPreDeterminedSymbols=*/true,
/*useDelayedPrivatization=*/false, symTable);
dsp.processStep1();
enableDelayedPrivatization, symTable);
dsp.processStep1(&simdClauseOps);

mlir::omp::LoopNestOperands loopNestClauseOps;
llvm::SmallVector<const semantics::Symbol *> iv;
genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc,
loopNestClauseOps, iv);

EntryBlockArgs simdArgs;
// TODO: Add private syms and vars.
simdArgs.priv.syms = dsp.getDelayedPrivSymbols();
simdArgs.priv.vars = simdClauseOps.privateVars;
simdArgs.reduction.syms = simdReductionSyms;
simdArgs.reduction.vars = simdClauseOps.reductionVars;
auto simdOp =
Expand Down
6 changes: 3 additions & 3 deletions flang/test/Lower/OpenMP/order-clause.f90
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@

!CHECK-LABEL: func.func @_QPsimd_order() {
subroutine simd_order
!CHECK: omp.simd order(reproducible:concurrent) {
!CHECK: omp.simd order(reproducible:concurrent) private({{.*}}) {
!$omp simd order(concurrent)
do i = 1, 10
end do
!CHECK: omp.simd order(reproducible:concurrent) {
!CHECK: omp.simd order(reproducible:concurrent) private({{.*}}) {
!$omp simd order(reproducible:concurrent)
do i = 1, 10
end do
!CHECK: omp.simd order(unconstrained:concurrent) {
!CHECK: omp.simd order(unconstrained:concurrent) private({{.*}}) {
!$omp simd order(unconstrained:concurrent)
do i = 1, 10
end do
Expand Down
32 changes: 20 additions & 12 deletions flang/test/Lower/OpenMP/parallel-private-clause.f90
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,23 @@
! RUN: bbc --use-desc-for-alloc=false -fopenmp -emit-hlfir %s -o - \
! RUN: | FileCheck %s --check-prefix=FIRDialect

! FIRDialect: omp.private {type = private} @_QFsimd_loop_1Er_private_ref_box_heap_f32 {{.*}} alloc {
! FIRDialect: [[R:%.*]] = fir.alloca !fir.box<!fir.heap<f32>> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
! FIRDialect: [[R_DECL:%.*]]:2 = hlfir.declare [[R]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "{{.*}}r"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
! FIRDialect: omp.yield([[R_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>)
! FIRDialect: } dealloc {
! FIRDialect: ^bb0([[R_DECL:%.*]]: !fir.ref<!fir.box<!fir.heap<f32>>>):
! FIRDialect: {{%.*}} = fir.load [[R_DECL]] : !fir.ref<!fir.box<!fir.heap<f32>>>
! FIRDialect: fir.if {{%.*}} {
! FIRDialect: [[LD:%.*]] = fir.load [[R_DECL]] : !fir.ref<!fir.box<!fir.heap<f32>>>
! FIRDialect: [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box<!fir.heap<f32>>) -> !fir.heap<f32>
! FIRDialect: fir.freemem [[AD]] : !fir.heap<f32>
! FIRDialect: fir.store {{%.*}} to [[R_DECL]] : !fir.ref<!fir.box<!fir.heap<f32>>>
! FIRDialect: omp.yield
! FIRDialect: }

!FIRDialect: omp.private {type = private} @[[DERIVED_PRIVATIZER:_QFprivate_clause_derived_typeEt_private_ref_rec__QFprivate_clause_derived_typeTmy_type]] : !fir.ref<!fir.type<_QFprivate_clause_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>> alloc {
!FIRDialect: ^bb0(%{{.*}}: !fir.ref<!fir.type<_QFprivate_clause_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>>):
!FIRDialect: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.type<_QFprivate_clause_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}> {bindc_name = "t", pinned, uniq_name = "_QFprivate_clause_derived_typeEt"}
Expand Down Expand Up @@ -246,7 +263,6 @@ subroutine parallel_pointer()
!$omp end parallel
end subroutine parallel_pointer


!FIRDialect-LABEL: func @_QPsimple_loop_1()
subroutine simple_loop_1
integer :: i
Expand Down Expand Up @@ -354,20 +370,17 @@ subroutine simple_loop_3
! FIRDialect: omp.terminator
end subroutine


!CHECK-LABEL: func @_QPsimd_loop_1()
subroutine simd_loop_1
integer :: i
real, allocatable :: r;
! FIRDialect: [[R:%.*]] = fir.alloca !fir.box<!fir.heap<f32>> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
! FIRDialect: [[R_DECL:%.*]]:2 = hlfir.declare [[R]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "{{.*}}r"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)

! FIRDialect: %[[LB:.*]] = arith.constant 1 : i32
! FIRDialect: %[[UB:.*]] = arith.constant 9 : i32
! FIRDialect: %[[STEP:.*]] = arith.constant 1 : i32

! FIRDialect: omp.simd {
! FIRDialect: omp.simd private({{.*}}) {
! FIRDialect-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
!$OMP SIMD PRIVATE(r)
do i=1, 9
Expand All @@ -378,10 +391,5 @@ subroutine simd_loop_1
end do
!$OMP END SIMD
! FIRDialect: omp.yield
! FIRDialect: {{%.*}} = fir.load [[R_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>
! FIRDialect: fir.if {{%.*}} {
! FIRDialect: [[LD:%.*]] = fir.load [[R_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>
! FIRDialect: [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box<!fir.heap<f32>>) -> !fir.heap<f32>
! FIRDialect: fir.freemem [[AD]] : !fir.heap<f32>
! FIRDialect: fir.store {{%.*}} to [[R_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<f32>>>

end subroutine
28 changes: 13 additions & 15 deletions flang/test/Lower/OpenMP/simd.f90
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ subroutine simd
! CHECK: %[[LB:.*]] = arith.constant 1 : i32
! CHECK-NEXT: %[[UB:.*]] = arith.constant 9 : i32
! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32
! CHECK-NEXT: omp.simd {
! CHECK-NEXT: omp.simd private({{.*}}) {
! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
do i=1, 9
! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref<i32>
Expand All @@ -33,7 +33,7 @@ subroutine simd_with_if_clause(n, threshold)
! CHECK: %[[LB:.*]] = arith.constant 1 : i32
! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0
! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
! CHECK: omp.simd if(%[[COND:.*]]) {
! CHECK: omp.simd if(%[[COND:.*]]) private({{.*}}) {
! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
do i = 1, n
! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref<i32>
Expand All @@ -52,7 +52,7 @@ subroutine simd_with_simdlen_clause(n, threshold)
! CHECK: %[[LB:.*]] = arith.constant 1 : i32
! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0
! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
! CHECK: omp.simd simdlen(2) {
! CHECK: omp.simd simdlen(2) private({{.*}}) {
! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
do i = 1, n
! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref<i32>
Expand All @@ -72,7 +72,7 @@ subroutine simd_with_simdlen_clause_from_param(n, threshold)
! CHECK: %[[LB:.*]] = arith.constant 1 : i32
! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0
! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
! CHECK: omp.simd simdlen(2) {
! CHECK: omp.simd simdlen(2) private({{.*}}) {
! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
do i = 1, n
! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref<i32>
Expand All @@ -92,7 +92,7 @@ subroutine simd_with_simdlen_clause_from_expr_from_param(n, threshold)
! CHECK: %[[LB:.*]] = arith.constant 1 : i32
! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0
! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
! CHECK: omp.simd simdlen(6) {
! CHECK: omp.simd simdlen(6) private({{.*}}) {
! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
do i = 1, n
! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref<i32>
Expand All @@ -111,7 +111,7 @@ subroutine simd_with_safelen_clause(n, threshold)
! CHECK: %[[LB:.*]] = arith.constant 1 : i32
! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0
! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
! CHECK: omp.simd safelen(2) {
! CHECK: omp.simd safelen(2) private({{.*}}) {
! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
do i = 1, n
! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref<i32>
Expand All @@ -131,7 +131,7 @@ subroutine simd_with_safelen_clause_from_expr_from_param(n, threshold)
! CHECK: %[[LB:.*]] = arith.constant 1 : i32
! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0
! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
! CHECK: omp.simd safelen(6) {
! CHECK: omp.simd safelen(6) private({{.*}}) {
! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
do i = 1, n
! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref<i32>
Expand All @@ -150,7 +150,7 @@ subroutine simd_with_simdlen_safelen_clause(n, threshold)
! CHECK: %[[LB:.*]] = arith.constant 1 : i32
! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0
! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
! CHECK: omp.simd safelen(2) simdlen(1) {
! CHECK: omp.simd safelen(2) simdlen(1) private({{.*}}) {
! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
do i = 1, n
! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref<i32>
Expand All @@ -171,7 +171,7 @@ subroutine simd_with_collapse_clause(n)
! CHECK: %[[LOWER_J:.*]] = arith.constant 1 : i32
! CHECK: %[[UPPER_J:.*]] = fir.load %[[PARAM_ARG:.*]] : !fir.ref<i32>
! CHECK: %[[STEP_J:.*]] = arith.constant 1 : i32
! CHECK: omp.simd {
! CHECK: omp.simd private({{.*}}) {
! CHECK-NEXT: omp.loop_nest (%[[ARG_0:.*]], %[[ARG_1:.*]]) : i32 = (
! CHECK-SAME: %[[LOWER_I]], %[[LOWER_J]]) to (
! CHECK-SAME: %[[UPPER_I]], %[[UPPER_J]]) inclusive step (
Expand Down Expand Up @@ -235,7 +235,7 @@ subroutine simd_with_nontemporal_clause(n)
!CHECK: %[[LB:.*]] = arith.constant 1 : i32
!CHECK: %[[UB:.*]] = fir.load %{{.*}}#0 : !fir.ref<i32>
!CHECK: %[[STEP:.*]] = arith.constant 1 : i32
!CHECK: omp.simd nontemporal(%[[A_DECL]]#1, %[[C_DECL]]#1 : !fir.ref<i32>, !fir.ref<i32>) {
!CHECK: omp.simd nontemporal(%[[A_DECL]]#1, %[[C_DECL]]#1 : !fir.ref<i32>, !fir.ref<i32>) private({{.*}}) {
!CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
!$OMP SIMD NONTEMPORAL(A, C)
do i = 1, n
Expand All @@ -249,16 +249,14 @@ subroutine lastprivate_with_simd

!CHECK: %[[VAR_SUM:.*]] = fir.alloca f32 {bindc_name = "sum", uniq_name = "_QFlastprivate_with_simdEsum"}
!CHECK: %[[VAR_SUM_DECLARE:.*]]:2 = hlfir.declare %[[VAR_SUM]] {{.*}}
!CHECK: %[[VAR_SUM_PINNED:.*]] = fir.alloca f32 {bindc_name = "sum", pinned, uniq_name = "_QFlastprivate_with_simdEsum"}
!CHECK: %[[VAR_SUM_PINNED_DECLARE:.*]]:2 = hlfir.declare %[[VAR_SUM_PINNED]] {{.*}}

implicit none
integer :: i
real :: sum


!CHECK: omp.simd {
!CHECK: omp.simd private(@_QFlastprivate_with_simdEsum_private_ref_f32 %[[VAR_SUM_DECLARE]]#0 -> %[[VAR_SUM_PINNED:.*]], @{{.*}}) {
!CHECK: omp.loop_nest (%[[ARG:.*]]) : i32 = ({{.*}} to ({{.*}}) inclusive step ({{.*}}) {
!CHECK: %[[VAR_SUM_PINNED_DECLARE:.*]]:2 = hlfir.declare %[[VAR_SUM_PINNED]] {{.*}}
!CHECK: %[[ADD_RESULT:.*]] = arith.addi {{.*}}
!CHECK: %[[ADD_RESULT_CONVERT:.*]] = fir.convert %[[ADD_RESULT]] : (i32) -> f32
!CHECK: hlfir.assign %[[ADD_RESULT_CONVERT]] to %[[VAR_SUM_PINNED_DECLARE]]#0 : f32, !fir.ref<f32>
Expand All @@ -283,7 +281,7 @@ subroutine simd_with_reduction_clause
! CHECK: %[[LB:.*]] = arith.constant 1 : i32
! CHECK-NEXT: %[[UB:.*]] = arith.constant 9 : i32
! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32
! CHECK-NEXT: omp.simd reduction(@[[REDUCER]] %[[X:.*]]#0 -> %[[X_RED:.*]] : !fir.ref<i32>) {
! CHECK-NEXT: omp.simd private({{.*}}) reduction(@[[REDUCER]] %[[X:.*]]#0 -> %[[X_RED:.*]] : !fir.ref<i32>) {
! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
!$omp simd reduction(+:x)
do i=1, 9
Expand Down
7 changes: 1 addition & 6 deletions llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5203,12 +5203,7 @@ void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
Function *F = CanonicalLoop->getFunction();

// Define where if branch should be inserted
Instruction *SplitBefore;
if (Instruction::classof(IfCond)) {
SplitBefore = dyn_cast<Instruction>(IfCond);
} else {
SplitBefore = CanonicalLoop->getPreheader()->getTerminator();
}
Instruction *SplitBefore = CanonicalLoop->getPreheader()->getTerminator();

// TODO: We should not rely on pass manager. Currently we use pass manager
// only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
Expand Down
2 changes: 1 addition & 1 deletion mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2116,7 +2116,7 @@ void SimdOp::build(OpBuilder &builder, OperationState &state,
makeArrayAttr(ctx, clauses.alignments), clauses.ifExpr,
/*linear_vars=*/{}, /*linear_step_vars=*/{},
clauses.nontemporalVars, clauses.order, clauses.orderMod,
/*private_vars=*/{}, /*private_syms=*/nullptr,
clauses.privateVars, makeArrayAttr(ctx, clauses.privateSyms),
clauses.reductionVars,
makeDenseBoolArrayAttr(ctx, clauses.reductionByref),
makeArrayAttr(ctx, clauses.reductionSyms), clauses.safelen,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
.Case([&](omp::SimdOp op) {
checkLinear(op, result);
checkNontemporal(op, result);
checkPrivate(op, result);
checkReduction(op, result);
})
.Case<omp::AtomicReadOp, omp::AtomicWriteOp, omp::AtomicUpdateOp,
Expand Down Expand Up @@ -2230,8 +2229,28 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
if (failed(checkImplementationStatus(opInst)))
return failure();

MutableArrayRef<BlockArgument> privateBlockArgs =
cast<omp::BlockArgOpenMPOpInterface>(*simdOp).getPrivateBlockArgs();
SmallVector<mlir::Value> mlirPrivateVars;
SmallVector<llvm::Value *> llvmPrivateVars;
SmallVector<omp::PrivateClauseOp> privateDecls;
mlirPrivateVars.reserve(privateBlockArgs.size());
llvmPrivateVars.reserve(privateBlockArgs.size());
collectPrivatizationDecls(simdOp, privateDecls);

for (mlir::Value privateVar : simdOp.getPrivateVars())
mlirPrivateVars.push_back(privateVar);

llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
findAllocaInsertPoint(builder, moduleTranslation);
llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);

llvm::Expected<llvm::BasicBlock *> afterAllocas = allocatePrivateVars(
builder, moduleTranslation, privateBlockArgs, privateDecls,
mlirPrivateVars, llvmPrivateVars, allocaIP);
if (handleError(afterAllocas, opInst).failed())
return failure();

// Generator of the canonical loop body.
SmallVector<llvm::CanonicalLoopInfo *> loopInfos;
SmallVector<llvm::OpenMPIRBuilder::InsertPointTy> bodyInsertPoints;
Expand Down Expand Up @@ -2331,7 +2350,9 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
order, simdlen, safelen);

builder.restoreIP(afterIP);
return success();

return cleanupPrivateVars(builder, moduleTranslation, simdOp.getLoc(),
llvmPrivateVars, privateDecls);
}

/// Convert an Atomic Ordering attribute to llvm::AtomicOrdering.
Expand Down
Loading

0 comments on commit d92926b

Please sign in to comment.