From 3a04cf771108e270105dfc2a7b4b1d7dc7b078c1 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Tue, 14 May 2024 13:08:52 +0000 Subject: [PATCH 1/4] Test cases for vscale immediates --- .../AArch64/vscale-fixups.ll | 344 ++++++++++++++++++ 1 file changed, 344 insertions(+) create mode 100644 llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll new file mode 100644 index 0000000000000..367806ac438b0 --- /dev/null +++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll @@ -0,0 +1,344 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc %s -o - | FileCheck %s --check-prefixes=COMMON,BASE +;; Additional runlines to exercise lsr code which AArch64 normally wouldn't. +; RUN: llc %s -o - -lsr-preferred-addressing-mode=preindexed | FileCheck %s --check-prefixes=COMMON,PREINDEX +; RUN: llc %s -o - -lsr-preferred-addressing-mode=postindexed | FileCheck %s --check-prefixes=COMMON,POSTINDEX + +target triple = "aarch64-unknown-linux-gnu" + +define void @mulvl123_addressing(ptr %src, ptr %dst, i64 %count) #0 { +; COMMON-LABEL: mulvl123_addressing: +; COMMON: // %bb.0: // %entry +; COMMON-NEXT: ptrue p0.b +; COMMON-NEXT: mov x8, xzr +; COMMON-NEXT: .LBB0_1: // %for.body +; COMMON-NEXT: // =>This Inner Loop Header: Depth=1 +; COMMON-NEXT: ld1b { z0.b }, p0/z, [x0] +; COMMON-NEXT: ld1b { z1.b }, p0/z, [x0, #1, mul vl] +; COMMON-NEXT: ld1b { z2.b }, p0/z, [x0, #2, mul vl] +; COMMON-NEXT: ld1b { z3.b }, p0/z, [x0, #3, mul vl] +; COMMON-NEXT: addvl x0, x0, #5 +; COMMON-NEXT: umax z0.b, p0/m, z0.b, z1.b +; COMMON-NEXT: movprfx z1, z2 +; COMMON-NEXT: umax z1.b, p0/m, z1.b, z3.b +; COMMON-NEXT: umax z0.b, p0/m, z0.b, z1.b +; COMMON-NEXT: st1b { z0.b }, p0, [x1, x8] +; COMMON-NEXT: addvl x8, x8, #1 +; COMMON-NEXT: cmp x8, x2 +; COMMON-NEXT: b.lo .LBB0_1 +; COMMON-NEXT: // %bb.2: // %for.exit +; COMMON-NEXT: ret +entry: + %vscale = tail call i64 @llvm.vscale.i64() + %2 = shl nuw nsw i64 %vscale, 4 + %mul = shl nuw nsw i64 %vscale, 6 + br label %for.body + +for.body: + %src.addr = phi ptr [ %src, %entry ], [ %src.addr.next, %for.body ] + %idx = phi i64 [ 0, %entry ], [ %idx.next, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %src.addr, i64 %idx + %3 = load , ptr %arrayidx + %4 = getelementptr , ptr %arrayidx, i64 1 + %5 = load , ptr %4 + %6 = getelementptr , ptr %arrayidx, i64 2 + %7 = load , ptr %6 + %8 = getelementptr , ptr %arrayidx, i64 3 + %9 = load , ptr %8 + %10 = tail call @llvm.umax.nxv16i8( %3, %5) + %11 = tail call @llvm.umax.nxv16i8( %7, %9) + %12 = tail call @llvm.umax.nxv16i8( %10, %11) + %src.addr.next = getelementptr inbounds i8, ptr %src.addr, i64 %mul + %arrayidx4 = getelementptr inbounds i8, ptr %dst, i64 %idx + store %12, ptr %arrayidx4 + %idx.next = add i64 %idx, %2 + %cmp = icmp ult i64 %idx.next, %count + br i1 %cmp, label %for.body, label %for.exit + +for.exit: + ret void +} + +define void @many_mulvl1_addressing(ptr %src_rows, ptr %dst_rows, i64 %stride, i64 %count) #0 { +; COMMON-LABEL: many_mulvl1_addressing: +; COMMON: // %bb.0: // %entry +; COMMON-NEXT: mov x9, x1 +; COMMON-NEXT: ptrue p0.b +; COMMON-NEXT: add x10, x0, x2 +; COMMON-NEXT: inch x9 +; COMMON-NEXT: ptrue p1.h +; COMMON-NEXT: mov x8, xzr +; COMMON-NEXT: addvl x11, x10, #1 +; COMMON-NEXT: addvl x12, x0, #1 +; COMMON-NEXT: .LBB1_1: // %for.body +; COMMON-NEXT: // =>This Inner Loop 
Header: Depth=1 +; COMMON-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; COMMON-NEXT: ld1b { z1.b }, p0/z, [x10, x8] +; COMMON-NEXT: subs x3, x3, #1 +; COMMON-NEXT: ld1b { z2.b }, p0/z, [x12, x8] +; COMMON-NEXT: ld1b { z3.b }, p0/z, [x11, x8] +; COMMON-NEXT: add z0.b, z0.b, z1.b +; COMMON-NEXT: add z1.b, z2.b, z3.b +; COMMON-NEXT: st1b { z0.h }, p1, [x1, x8] +; COMMON-NEXT: st1b { z1.h }, p1, [x9, x8] +; COMMON-NEXT: addvl x8, x8, #2 +; COMMON-NEXT: b.ne .LBB1_1 +; COMMON-NEXT: // %bb.2: // %for.exit +; COMMON-NEXT: ret +entry: + %vscale = tail call i64 @llvm.vscale.i64() + %mul = shl nuw nsw i64 %vscale, 5 + br label %for.body + +for.body: + %src_row_addr = phi ptr [ %src_rows, %entry ], [ %add_ptr_src, %for.body ] + %dst_row_addr = phi ptr [ %dst_rows, %entry ], [ %add_ptr_dst, %for.body ] + %idx = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %2 = load , ptr %src_row_addr + %3 = getelementptr , ptr %src_row_addr, i64 1 + %4 = load , ptr %3 + %arrayidx2 = getelementptr inbounds i8, ptr %src_row_addr, i64 %stride + %5 = load , ptr %arrayidx2 + %6 = getelementptr , ptr %arrayidx2, i64 1 + %7 = load , ptr %6 + %8 = add %2, %5 + %9 = add %4, %7 + %10 = bitcast %8 to + %11 = trunc %10 to + store %11, ptr %dst_row_addr + %12 = bitcast %9 to + %13 = getelementptr , ptr %dst_row_addr, i64 1 + %14 = trunc %12 to + store %14, ptr %13 + %add_ptr_src = getelementptr inbounds i8, ptr %src_row_addr, i64 %mul + %add_ptr_dst = getelementptr inbounds i8, ptr %dst_row_addr, i64 %mul + %inc = add nuw i64 %idx, 1 + %exitcond = icmp eq i64 %inc, %count + br i1 %exitcond, label %for.exit, label %for.body + +for.exit: + ret void +} + +define void @fixed_iv_scalable_offset(ptr %src, ptr %dst, i64 %count) #0 { +; BASE-LABEL: fixed_iv_scalable_offset: +; BASE: // %bb.0: // %entry +; BASE-NEXT: ptrue p0.s +; BASE-NEXT: .LBB2_1: // %for.body +; BASE-NEXT: // =>This Inner Loop Header: Depth=1 +; BASE-NEXT: ld1w { z0.s }, p0/z, [x0] +; BASE-NEXT: ld1w { z1.s }, p0/z, [x0, #4, mul vl] +; BASE-NEXT: subs x2, x2, #4 +; BASE-NEXT: add x0, x0, #16 +; BASE-NEXT: add z0.s, z0.s, z1.s +; BASE-NEXT: st1w { z0.s }, p0, [x1] +; BASE-NEXT: add x1, x1, #16 +; BASE-NEXT: b.ne .LBB2_1 +; BASE-NEXT: // %bb.2: // %for.exit +; BASE-NEXT: ret +; +; PREINDEX-LABEL: fixed_iv_scalable_offset: +; PREINDEX: // %bb.0: // %entry +; PREINDEX-NEXT: ptrue p0.s +; PREINDEX-NEXT: .LBB2_1: // %for.body +; PREINDEX-NEXT: // =>This Inner Loop Header: Depth=1 +; PREINDEX-NEXT: ld1w { z0.s }, p0/z, [x0] +; PREINDEX-NEXT: ld1w { z1.s }, p0/z, [x0, #4, mul vl] +; PREINDEX-NEXT: subs x2, x2, #4 +; PREINDEX-NEXT: add x0, x0, #16 +; PREINDEX-NEXT: add z0.s, z0.s, z1.s +; PREINDEX-NEXT: st1w { z0.s }, p0, [x1] +; PREINDEX-NEXT: add x1, x1, #16 +; PREINDEX-NEXT: b.ne .LBB2_1 +; PREINDEX-NEXT: // %bb.2: // %for.exit +; PREINDEX-NEXT: ret +; +; POSTINDEX-LABEL: fixed_iv_scalable_offset: +; POSTINDEX: // %bb.0: // %entry +; POSTINDEX-NEXT: ptrue p0.s +; POSTINDEX-NEXT: addvl x8, x0, #4 +; POSTINDEX-NEXT: .LBB2_1: // %for.body +; POSTINDEX-NEXT: // =>This Inner Loop Header: Depth=1 +; POSTINDEX-NEXT: ld1w { z0.s }, p0/z, [x0] +; POSTINDEX-NEXT: ld1w { z1.s }, p0/z, [x8] +; POSTINDEX-NEXT: subs x2, x2, #4 +; POSTINDEX-NEXT: add x8, x8, #16 +; POSTINDEX-NEXT: add x0, x0, #16 +; POSTINDEX-NEXT: add z0.s, z0.s, z1.s +; POSTINDEX-NEXT: st1w { z0.s }, p0, [x1] +; POSTINDEX-NEXT: add x1, x1, #16 +; POSTINDEX-NEXT: b.ne .LBB2_1 +; POSTINDEX-NEXT: // %bb.2: // %for.exit +; POSTINDEX-NEXT: ret +entry: + %vscale = tail call i64 @llvm.vscale.i64() + %mul = shl nuw nsw i64 %vscale, 4 
+ br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %src.ptr = getelementptr inbounds i32, ptr %src, i64 %iv + %data = load , ptr %src.ptr + %src.ptr.offset = getelementptr inbounds i32, ptr %src.ptr, i64 %mul + %data2 = load , ptr %src.ptr.offset + %add = add %data, %data2 + %dst.ptr = getelementptr i32, ptr %dst, i64 %iv + store %add, ptr %dst.ptr + %inc = add nuw i64 %iv, 4 + %exit.cond = icmp eq i64 %inc, %count + br i1 %exit.cond, label %for.exit, label %for.body + +for.exit: + ret void +} + +define void @mixed_offsets_scalable_then_fixed(ptr %src, ptr %dst, i64 %count) #0 { +; COMMON-LABEL: mixed_offsets_scalable_then_fixed: +; COMMON: // %bb.0: // %entry +; COMMON-NEXT: ptrue p0.s +; COMMON-NEXT: mov x8, xzr +; COMMON-NEXT: addvl x9, x0, #4 +; COMMON-NEXT: mov x10, #8 // =0x8 +; COMMON-NEXT: .LBB3_1: // %for.body +; COMMON-NEXT: // =>This Inner Loop Header: Depth=1 +; COMMON-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; COMMON-NEXT: ld1w { z1.s }, p0/z, [x9, x8, lsl #2] +; COMMON-NEXT: add x11, x9, x8, lsl #2 +; COMMON-NEXT: ld1w { z2.s }, p0/z, [x11, x10, lsl #2] +; COMMON-NEXT: add z0.s, z0.s, z1.s +; COMMON-NEXT: add z0.s, z0.s, z2.s +; COMMON-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; COMMON-NEXT: incw x8 +; COMMON-NEXT: cmp x2, x8 +; COMMON-NEXT: b.ne .LBB3_1 +; COMMON-NEXT: // %bb.2: // %for.exit +; COMMON-NEXT: ret +entry: + %vscale = tail call i64 @llvm.vscale.i64() + %mul = shl nuw nsw i64 %vscale, 4 + %vl = shl nuw nsw i64 %vscale, 2 + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %src.ptr = getelementptr inbounds i32, ptr %src, i64 %iv + %data = load , ptr %src.ptr + %src.ptr.sc_off = getelementptr inbounds i32, ptr %src.ptr, i64 %mul + %data2 = load , ptr %src.ptr.sc_off + %src.ptr.fx_off = getelementptr inbounds i32, ptr %src.ptr.sc_off, i64 8 + %data3 = load , ptr %src.ptr.fx_off + %add = add %data, %data2 + %add2 = add %add, %data3 + %dst.ptr = getelementptr i32, ptr %dst, i64 %iv + store %add2, ptr %dst.ptr + %inc = add nuw i64 %iv, %vl + %exit.cond = icmp eq i64 %inc, %count + br i1 %exit.cond, label %for.exit, label %for.body + +for.exit: + ret void +} + +define void @mixed_offsets_fixed_then_scalable(ptr %src, ptr %dst, i64 %count) #0 { +; COMMON-LABEL: mixed_offsets_fixed_then_scalable: +; COMMON: // %bb.0: // %entry +; COMMON-NEXT: addvl x9, x0, #4 +; COMMON-NEXT: ptrue p0.s +; COMMON-NEXT: mov x8, xzr +; COMMON-NEXT: add x9, x9, #32 +; COMMON-NEXT: mov x10, #8 // =0x8 +; COMMON-NEXT: .LBB4_1: // %for.body +; COMMON-NEXT: // =>This Inner Loop Header: Depth=1 +; COMMON-NEXT: add x11, x0, x8, lsl #2 +; COMMON-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; COMMON-NEXT: ld1w { z2.s }, p0/z, [x9, x8, lsl #2] +; COMMON-NEXT: ld1w { z1.s }, p0/z, [x11, x10, lsl #2] +; COMMON-NEXT: add z0.s, z0.s, z1.s +; COMMON-NEXT: add z0.s, z0.s, z2.s +; COMMON-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; COMMON-NEXT: incw x8 +; COMMON-NEXT: cmp x2, x8 +; COMMON-NEXT: b.ne .LBB4_1 +; COMMON-NEXT: // %bb.2: // %for.exit +; COMMON-NEXT: ret +entry: + %vscale = tail call i64 @llvm.vscale.i64() + %mul = shl nuw nsw i64 %vscale, 4 + %vl = shl nuw nsw i64 %vscale, 2 + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %src.ptr = getelementptr inbounds i32, ptr %src, i64 %iv + %data = load , ptr %src.ptr + %src.ptr.fx_off = getelementptr inbounds i32, ptr %src.ptr, i64 8 + %data2 = load , ptr %src.ptr.fx_off + %src.ptr.sc_off = getelementptr inbounds i32, ptr %src.ptr.fx_off, i64 
%mul + %data3 = load , ptr %src.ptr.sc_off + %add = add %data, %data2 + %add2 = add %add, %data3 + %dst.ptr = getelementptr i32, ptr %dst, i64 %iv + store %add2, ptr %dst.ptr + %inc = add nuw i64 %iv, %vl + %exit.cond = icmp eq i64 %inc, %count + br i1 %exit.cond, label %for.exit, label %for.body + +for.exit: + ret void +} + +;; FIXME: There's an opportunity here (that we currently miss) to define the phi +;; on the middle access, and have negative and positive scalable immediates. +;; +;; Currently we generate a scalable offset for the load in range of the base, +;; and a register to store the offset for the access that's out of range of the +;; base (but in range of the other). +;; +define void @three_access_wide_gap(ptr %src, ptr %dst, i64 %count) #0 { +; COMMON-LABEL: three_access_wide_gap: +; COMMON: // %bb.0: // %entry +; COMMON-NEXT: ptrue p0.s +; COMMON-NEXT: mov x8, xzr +; COMMON-NEXT: addvl x9, x0, #8 +; COMMON-NEXT: addvl x10, x0, #4 +; COMMON-NEXT: .LBB5_1: // %for.body +; COMMON-NEXT: // =>This Inner Loop Header: Depth=1 +; COMMON-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; COMMON-NEXT: ld1w { z1.s }, p0/z, [x10, x8, lsl #2] +; COMMON-NEXT: ld1w { z2.s }, p0/z, [x9, x8, lsl #2] +; COMMON-NEXT: add z0.s, z0.s, z1.s +; COMMON-NEXT: add z0.s, z0.s, z2.s +; COMMON-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; COMMON-NEXT: incw x8 +; COMMON-NEXT: cmp x2, x8 +; COMMON-NEXT: b.ne .LBB5_1 +; COMMON-NEXT: // %bb.2: // %for.exit +; COMMON-NEXT: ret +entry: + %vscale = tail call i64 @llvm.vscale.i64() + %mul = mul nuw nsw i64 %vscale, 16 + %mul2 = mul nuw nsw i64 %vscale, 16 + %vl = mul nuw nsw i64 %vscale, 4 + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %src.ptr = getelementptr inbounds i32, ptr %src, i64 %iv + %data = load , ptr %src.ptr + %src.ptr.sc_off = getelementptr inbounds i32, ptr %src.ptr, i64 %mul + %data2 = load , ptr %src.ptr.sc_off + %src.ptr.sc_off2 = getelementptr inbounds i32, ptr %src.ptr.sc_off, i64 %mul2 + %data3 = load , ptr %src.ptr.sc_off2 + %add = add %data, %data2 + %add2 = add %add, %data3 + %dst.ptr = getelementptr i32, ptr %dst, i64 %iv + store %add2, ptr %dst.ptr + %inc = add nuw i64 %iv, %vl + %exit.cond = icmp eq i64 %inc, %count + br i1 %exit.cond, label %for.exit, label %for.body + +for.exit: + ret void +} + +attributes #0 = { "target-features"="+sve2" vscale_range(1,16) } From 557a0970eef95b33e30264433bb5f3350f1fdf70 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Wed, 15 May 2024 11:06:24 +0000 Subject: [PATCH 2/4] Convert LSR to use possibly-scalable Immediate type --- .../Transforms/Scalar/LoopStrengthReduce.cpp | 413 +++++++++++------- 1 file changed, 257 insertions(+), 156 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 3a98e257367b2..0b77b7b5a62de 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -247,6 +247,68 @@ class RegSortData { void dump() const; }; +// An offset from an address that is either scalable or fixed. Used for +// per-target optimizations of addressing modes. 
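#include <cstdint>
// Illustration only -- a standalone sketch (hypothetical names, not part of
// this change) of what the fixed-or-scalable offset below models: a fixed
// offset is a plain byte count, while a scalable one is a multiple of the
// runtime vscale (e.g. SVE's "#1, mul vl" addressing for a full-vector step).
struct OffsetSketch {
  int64_t MinVal; // byte count, or multiple of vscale
  bool Scalable;
  int64_t bytes(uint64_t VScale) const {
    return Scalable ? MinVal * (int64_t)VScale : MinVal;
  }
};
// OffsetSketch{16, false}.bytes(4) == 16   (ordinary immediate)
// OffsetSketch{16, true}.bytes(4)  == 64   (16-byte granule scaled by vscale)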
+class Immediate : public details::FixedOrScalableQuantity { + constexpr Immediate(ScalarTy MinVal, bool Scalable) + : FixedOrScalableQuantity(MinVal, Scalable) {} + + constexpr Immediate(const FixedOrScalableQuantity &V) + : FixedOrScalableQuantity(V) {} + +public: + constexpr Immediate() : FixedOrScalableQuantity() {} + + static constexpr Immediate getFixed(ScalarTy MinVal) { + return Immediate(MinVal, false); + } + static constexpr Immediate getScalable(ScalarTy MinVal) { + return Immediate(MinVal, true); + } + static constexpr Immediate get(ScalarTy MinVal, bool Scalable) { + return Immediate(MinVal, Scalable); + } + + constexpr bool isLessThanZero() const { return Quantity < 0; } + + constexpr bool isGreaterThanZero() const { return Quantity > 0; } + + constexpr bool isMin() const { + return Quantity == std::numeric_limits::min(); + } + + constexpr bool isMax() const { + return Quantity == std::numeric_limits::max(); + } +}; + +// This is needed for the Compare type of std::map when Immediate is used +// as a key. We don't need it to be fully correct against any value of vscale, +// just to make sure that vscale-related terms in the map are considered against +// each other rather than being mixed up and potentially missing opportunities. +struct KeyOrderTargetImmediate { + bool operator()(const Immediate &LHS, const Immediate &RHS) const { + if (LHS.isScalable() && !RHS.isScalable()) + return false; + if (!LHS.isScalable() && RHS.isScalable()) + return true; + return LHS.getKnownMinValue() < RHS.getKnownMinValue(); + } +}; + +// This would be nicer if we could be generic instead of directly using size_t, +// but there doesn't seem to be a type trait for is_orderable or +// is_lessthan_comparable or similar. +struct KeyOrderSizeTAndImmediate { + bool operator()(const std::pair &LHS, + const std::pair &RHS) const { + size_t LSize = LHS.first; + size_t RSize = RHS.first; + if (LSize != RSize) + return LSize < RSize; + return KeyOrderTargetImmediate()(LHS.second, RHS.second); + } +}; } // end anonymous namespace #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -357,7 +419,7 @@ struct Formula { GlobalValue *BaseGV = nullptr; /// Base offset for complex addressing. - int64_t BaseOffset = 0; + Immediate BaseOffset; /// Whether any complex addressing has a base register. bool HasBaseReg = false; @@ -388,7 +450,7 @@ struct Formula { /// An additional constant offset which added near the use. This requires a /// temporary register, but the offset itself can live in an add immediate /// field rather than a register. - int64_t UnfoldedOffset = 0; + Immediate UnfoldedOffset; Formula() = default; @@ -628,7 +690,7 @@ void Formula::print(raw_ostream &OS) const { if (!First) OS << " + "; else First = false; BaseGV->printAsOperand(OS, /*PrintType=*/false); } - if (BaseOffset != 0) { + if (BaseOffset.isNonZero()) { if (!First) OS << " + "; else First = false; OS << BaseOffset; } @@ -652,7 +714,7 @@ void Formula::print(raw_ostream &OS) const { OS << ""; OS << ')'; } - if (UnfoldedOffset != 0) { + if (UnfoldedOffset.isNonZero()) { if (!First) OS << " + "; OS << "imm(" << UnfoldedOffset << ')'; } @@ -798,28 +860,28 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, /// If S involves the addition of a constant integer value, return that integer /// value, and mutate S to point to a new SCEV with that value excluded. 
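#include <cstdint>
// Illustration only (standalone; OffsetKey is a hypothetical stand-in for the
// Immediate class above): the effect of a comparator like
// KeyOrderTargetImmediate is simply to keep fixed and scalable keys in
// disjoint ranges of a std::map, so offsets are only ordered against offsets
// of the same kind rather than being interleaved on their raw values.
struct OffsetKey { int64_t MinVal; bool Scalable; };
struct OffsetKeyLess {
  bool operator()(const OffsetKey &L, const OffsetKey &R) const {
    if (L.Scalable != R.Scalable)
      return !L.Scalable; // every fixed key sorts before every scalable key
    return L.MinVal < R.MinVal;
  }
};
// A std::map<OffsetKey, T, OffsetKeyLess> iterates as
//   {8, false}, {24, false}, {8, true}, {24, true}.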
-static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { +static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { if (const SCEVConstant *C = dyn_cast(S)) { if (C->getAPInt().getSignificantBits() <= 64) { S = SE.getConstant(C->getType(), 0); - return C->getValue()->getSExtValue(); + return Immediate::getFixed(C->getValue()->getSExtValue()); } } else if (const SCEVAddExpr *Add = dyn_cast(S)) { SmallVector NewOps(Add->operands()); - int64_t Result = ExtractImmediate(NewOps.front(), SE); - if (Result != 0) + Immediate Result = ExtractImmediate(NewOps.front(), SE); + if (Result.isNonZero()) S = SE.getAddExpr(NewOps); return Result; } else if (const SCEVAddRecExpr *AR = dyn_cast(S)) { SmallVector NewOps(AR->operands()); - int64_t Result = ExtractImmediate(NewOps.front(), SE); - if (Result != 0) + Immediate Result = ExtractImmediate(NewOps.front(), SE); + if (Result.isNonZero()) S = SE.getAddRecExpr(NewOps, AR->getLoop(), // FIXME: AR->getNoWrapFlags(SCEV::FlagNW) SCEV::FlagAnyWrap); return Result; } - return 0; + return Immediate(); } /// If S involves the addition of a GlobalValue address, return that symbol, and @@ -1134,7 +1196,7 @@ struct LSRFixup { /// A constant offset to be added to the LSRUse expression. This allows /// multiple fixups to share the same LSRUse with different offsets, for /// example in an unrolled loop. - int64_t Offset = 0; + Immediate Offset; LSRFixup() = default; @@ -1197,8 +1259,10 @@ class LSRUse { SmallVector Fixups; /// Keep track of the min and max offsets of the fixups. - int64_t MinOffset = std::numeric_limits::max(); - int64_t MaxOffset = std::numeric_limits::min(); + Immediate MinOffset = + Immediate::getFixed(std::numeric_limits::max()); + Immediate MaxOffset = + Immediate::getFixed(std::numeric_limits::min()); /// This records whether all of the fixups using this LSRUse are outside of /// the loop, in which case some special-case heuristics may be used. @@ -1234,9 +1298,9 @@ class LSRUse { void pushFixup(LSRFixup &f) { Fixups.push_back(f); - if (f.Offset > MaxOffset) + if (Immediate::isKnownGT(f.Offset, MaxOffset)) MaxOffset = f.Offset; - if (f.Offset < MinOffset) + if (Immediate::isKnownLT(f.Offset, MinOffset)) MinOffset = f.Offset; } @@ -1254,7 +1318,7 @@ class LSRUse { static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, - GlobalValue *BaseGV, int64_t BaseOffset, + GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale, Instruction *Fixup = nullptr, int64_t ScalableOffset = 0); @@ -1311,7 +1375,7 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg, // addressing. if (AMK == TTI::AMK_PreIndexed) { if (auto *Step = dyn_cast(AR->getStepRecurrence(*SE))) - if (Step->getAPInt() == F.BaseOffset) + if (Step->getAPInt() == F.BaseOffset.getFixedValue()) LoopCost = 0; } else if (AMK == TTI::AMK_PostIndexed) { const SCEV *LoopStep = AR->getStepRecurrence(*SE); @@ -1402,24 +1466,25 @@ void Cost::RateFormula(const Formula &F, // allows to fold 2 registers. C.NumBaseAdds += NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F))); - C.NumBaseAdds += (F.UnfoldedOffset != 0); + C.NumBaseAdds += (F.UnfoldedOffset.isNonZero()); // Accumulate non-free scaling amounts. C.ScaleCost += *getScalingFactorCost(*TTI, LU, F, *L).getValue(); // Tally up the non-zero immediates. 
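#include <bit>
#include <cstdint>
// Illustration only (standalone, assumes C++20 <bit>): the per-fixup ImmCost
// charged below is APInt::getSignificantBits(), i.e. the smallest signed
// bit-width that can represent the combined offset. An equivalent computation:
static int significantBits(int64_t V) {
  uint64_t U = static_cast<uint64_t>(V);
  int SignBits = V < 0 ? std::countl_one(U) : std::countl_zero(U);
  return 64 - SignBits + 1; // e.g. 0 -> 1, 255 -> 9, -256 -> 9
}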
for (const LSRFixup &Fixup : LU.Fixups) { - int64_t O = Fixup.Offset; - int64_t Offset = (uint64_t)O + F.BaseOffset; + Immediate O = Fixup.Offset; + Immediate Offset = Immediate::getFixed((uint64_t)O.getFixedValue() + + F.BaseOffset.getFixedValue()); if (F.BaseGV) C.ImmCost += 64; // Handle symbolic values conservatively. // TODO: This should probably be the pointer size. - else if (Offset != 0) - C.ImmCost += APInt(64, Offset, true).getSignificantBits(); + else if (Offset.isNonZero()) + C.ImmCost += APInt(64, Offset.getFixedValue(), true).getSignificantBits(); // Check with target if this offset with this instruction is // specifically not supported. - if (LU.Kind == LSRUse::Address && Offset != 0 && + if (LU.Kind == LSRUse::Address && Offset.isNonZero() && !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV, Offset, F.HasBaseReg, F.Scale, Fixup.UserInst)) C.NumBaseAdds++; @@ -1547,7 +1612,7 @@ void LSRFixup::print(raw_ostream &OS) const { PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false); } - if (Offset != 0) + if (Offset.isNonZero()) OS << ", Offset=" << Offset; } @@ -1674,15 +1739,16 @@ LLVM_DUMP_METHOD void LSRUse::dump() const { static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, - GlobalValue *BaseGV, int64_t BaseOffset, + GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale, Instruction *Fixup /* = nullptr */, int64_t ScalableOffset) { switch (Kind) { case LSRUse::Address: - return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset, - HasBaseReg, Scale, AccessTy.AddrSpace, - Fixup, ScalableOffset); + return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, + BaseOffset.getFixedValue(), HasBaseReg, + Scale, AccessTy.AddrSpace, Fixup, + ScalableOffset); case LSRUse::ICmpZero: // There's not even a target hook for querying whether it would be legal to @@ -1691,7 +1757,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, return false; // ICmp only has two operands; don't allow more than two non-trivial parts. - if (Scale != 0 && HasBaseReg && BaseOffset != 0) + if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero()) return false; // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by @@ -1701,7 +1767,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, // If we have low-level target information, ask the target if it can fold an // integer immediate on an icmp. - if (BaseOffset != 0) { + if (BaseOffset.isNonZero()) { // We have one of: // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset @@ -1709,8 +1775,8 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, if (Scale == 0) // The cast does the right thing with // std::numeric_limits::min(). - BaseOffset = -(uint64_t)BaseOffset; - return TTI.isLegalICmpImmediate(BaseOffset); + BaseOffset = BaseOffset.getFixed(-(uint64_t)BaseOffset.getFixedValue()); + return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue()); } // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg @@ -1718,31 +1784,34 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, case LSRUse::Basic: // Only handle single-register values. - return !BaseGV && Scale == 0 && BaseOffset == 0 && ScalableOffset == 0; + return !BaseGV && Scale == 0 && BaseOffset.isZero(); case LSRUse::Special: // Special case Basic to handle -1 scales. 
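#include <cassert>
#include <cstdint>
// Illustration only (standalone, ignoring wraparound at the int64_t extremes):
// the ICmpZero folds described above are plain algebraic rewrites of a compare
// against zero, so legality reduces to whether the target accepts the
// (possibly negated) constant as an icmp immediate.
static void icmpZeroRewriteDemo(int64_t Reg) {
  // ICmpZero BaseReg + 16      =>  ICmp BaseReg, -16
  assert(((Reg + 16) == 0) == (Reg == -16));
  // ICmpZero -1*ScaleReg + 16  =>  ICmp ScaleReg, 16
  assert(((16 - Reg) == 0) == (Reg == 16));
}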
- return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0 && - ScalableOffset == 0; + return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero(); } llvm_unreachable("Invalid LSRUse Kind!"); } static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, - int64_t MinOffset, int64_t MaxOffset, + Immediate MinOffset, Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, - GlobalValue *BaseGV, int64_t BaseOffset, + GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale) { + int64_t Base = BaseOffset.getFixedValue(); + int64_t Min = MinOffset.getFixedValue(); + int64_t Max = MaxOffset.getFixedValue(); // Check for overflow. - if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) != - (MinOffset > 0)) + if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0)) return false; - MinOffset = (uint64_t)BaseOffset + MinOffset; - if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) != - (MaxOffset > 0)) + Min = (uint64_t)Base + Min; + if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0)) return false; - MaxOffset = (uint64_t)BaseOffset + MaxOffset; + Max = (uint64_t)Base + Max; + + MinOffset = Immediate::getFixed(Min); + MaxOffset = Immediate::getFixed(Max); return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset, HasBaseReg, Scale) && @@ -1751,7 +1820,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, } static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, - int64_t MinOffset, int64_t MaxOffset, + Immediate MinOffset, Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, const Formula &F, const Loop &L) { // For the purpose of isAMCompletelyFolded either having a canonical formula @@ -1767,10 +1836,10 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, } /// Test whether we know how to expand the current formula. -static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, - int64_t MaxOffset, LSRUse::KindType Kind, +static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset, + Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, - int64_t BaseOffset, bool HasBaseReg, int64_t Scale) { + Immediate BaseOffset, bool HasBaseReg, int64_t Scale) { // We know how to expand completely foldable formulae. return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale) || @@ -1781,8 +1850,8 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, BaseGV, BaseOffset, true, 0)); } -static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, - int64_t MaxOffset, LSRUse::KindType Kind, +static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset, + Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, const Formula &F) { return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale); @@ -1819,15 +1888,15 @@ static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, switch (LU.Kind) { case LSRUse::Address: { + int64_t FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue(); + int64_t FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue(); // Check the scaling factor cost with both the min and max offsets. 
InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost( - LU.AccessTy.MemTy, F.BaseGV, - StackOffset::getFixed(F.BaseOffset + LU.MinOffset), F.HasBaseReg, - F.Scale, LU.AccessTy.AddrSpace); + LU.AccessTy.MemTy, F.BaseGV, StackOffset::getFixed(FixedMin), + F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace); InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost( - LU.AccessTy.MemTy, F.BaseGV, - StackOffset::getFixed(F.BaseOffset + LU.MaxOffset), F.HasBaseReg, - F.Scale, LU.AccessTy.AddrSpace); + LU.AccessTy.MemTy, F.BaseGV, StackOffset::getFixed(FixedMax), + F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace); assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() && "Legal addressing mode has an illegal cost!"); @@ -1846,10 +1915,11 @@ static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, - GlobalValue *BaseGV, int64_t BaseOffset, + GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t ScalableOffset = 0) { // Fast-path: zero is always foldable. - if (BaseOffset == 0 && !BaseGV) return true; + if (BaseOffset.isZero() && !BaseGV) + return true; // Conservatively, create an address with an immediate and a // base and a scale. @@ -1867,8 +1937,8 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, } static bool isAlwaysFoldable(const TargetTransformInfo &TTI, - ScalarEvolution &SE, int64_t MinOffset, - int64_t MaxOffset, LSRUse::KindType Kind, + ScalarEvolution &SE, Immediate MinOffset, + Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, const SCEV *S, bool HasBaseReg) { // Fast-path: zero is always foldable. @@ -1876,14 +1946,15 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, // Conservatively, create an address with an immediate and a // base and a scale. - int64_t BaseOffset = ExtractImmediate(S, SE); + Immediate BaseOffset = ExtractImmediate(S, SE); GlobalValue *BaseGV = ExtractSymbol(S, SE); // If there's anything else involved, it's not foldable. if (!S->isZero()) return false; // Fast-path: zero is always foldable. - if (BaseOffset == 0 && !BaseGV) return true; + if (BaseOffset.isZero() && !BaseGV) + return true; // Conservatively, create an address with an immediate and a // base and a scale. @@ -2032,11 +2103,11 @@ class LSRInstance { using UseMapTy = DenseMap; UseMapTy UseMap; - bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, + bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg, LSRUse::KindType Kind, MemAccessTy AccessTy); - std::pair getUse(const SCEV *&Expr, LSRUse::KindType Kind, - MemAccessTy AccessTy); + std::pair getUse(const SCEV *&Expr, LSRUse::KindType Kind, + MemAccessTy AccessTy); void DeleteUse(LSRUse &LU, size_t LUIdx); @@ -2062,7 +2133,7 @@ class LSRInstance { void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx, const Formula &Base, - const SmallVectorImpl &Worklist, + const SmallVectorImpl &Worklist, size_t Idx, bool IsScaledReg = false); void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base); @@ -2570,11 +2641,11 @@ LSRInstance::OptimizeLoopTermCond() { /// Determine if the given use can accommodate a fixup at the given offset and /// other details. If so, update the use and return true. 
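#include <algorithm>
#include <cstdint>
// Illustration only (standalone; the 4095-byte span is a made-up stand-in for
// the target's isAlwaysFoldable answer): each use keeps one [Min, Max] offset
// window, and a new fixup offset is accepted only if the widened span would
// still fold into a single addressing mode, which is what the function below
// checks before updating LU.MinOffset/LU.MaxOffset.
struct OffsetWindow {
  int64_t Min, Max; // assumes at least one offset was already recorded
  bool tryAdd(int64_t NewOff) {
    const int64_t FoldableSpan = 4095; // hypothetical target limit
    // (overflow at the int64_t extremes is ignored in this sketch)
    if (NewOff < Min && Max - NewOff > FoldableSpan)
      return false;
    if (NewOff > Max && NewOff - Min > FoldableSpan)
      return false;
    Min = std::min(Min, NewOff);
    Max = std::max(Max, NewOff);
    return true;
  }
};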
-bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, +bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg, LSRUse::KindType Kind, MemAccessTy AccessTy) { - int64_t NewMinOffset = LU.MinOffset; - int64_t NewMaxOffset = LU.MaxOffset; + Immediate NewMinOffset = LU.MinOffset; + Immediate NewMaxOffset = LU.MaxOffset; MemAccessTy NewAccessTy = AccessTy; // Check for a mismatched kind. It's tempting to collapse mismatched kinds to @@ -2594,12 +2665,12 @@ bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, } // Conservatively assume HasBaseReg is true for now. - if (NewOffset < LU.MinOffset) { + if (Immediate::isKnownLT(NewOffset, LU.MinOffset)) { if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr, LU.MaxOffset - NewOffset, HasBaseReg)) return false; NewMinOffset = NewOffset; - } else if (NewOffset > LU.MaxOffset) { + } else if (Immediate::isKnownGT(NewOffset, LU.MaxOffset)) { if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr, NewOffset - LU.MinOffset, HasBaseReg)) return false; @@ -2616,17 +2687,17 @@ bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, /// Return an LSRUse index and an offset value for a fixup which needs the given /// expression, with the given kind and optional access type. Either reuse an /// existing use or create a new one, as needed. -std::pair LSRInstance::getUse(const SCEV *&Expr, - LSRUse::KindType Kind, - MemAccessTy AccessTy) { +std::pair LSRInstance::getUse(const SCEV *&Expr, + LSRUse::KindType Kind, + MemAccessTy AccessTy) { const SCEV *Copy = Expr; - int64_t Offset = ExtractImmediate(Expr, SE); + Immediate Offset = ExtractImmediate(Expr, SE); // Basic uses can't accept any offset, for example. if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr, Offset, /*HasBaseReg=*/ true)) { Expr = Copy; - Offset = 0; + Offset = Immediate::getFixed(0); } std::pair P = @@ -2687,7 +2758,7 @@ LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF, F.BaseGV == OrigF.BaseGV && F.Scale == OrigF.Scale && F.UnfoldedOffset == OrigF.UnfoldedOffset) { - if (F.BaseOffset == 0) + if (F.BaseOffset.isZero()) return &LU; // This is the formula where all the registers and symbols matched; // there aren't going to be any others. Since we declined it, we @@ -3192,7 +3263,8 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand); if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr, - IncOffset, /*HasBaseReg=*/false, ScalableOffset)) + Immediate::getFixed(IncOffset), /*HasBaseReg=*/false, + ScalableOffset)) return false; return true; @@ -3424,9 +3496,9 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { } // Get or create an LSRUse. - std::pair P = getUse(S, Kind, AccessTy); + std::pair P = getUse(S, Kind, AccessTy); size_t LUIdx = P.first; - int64_t Offset = P.second; + Immediate Offset = P.second; LSRUse &LU = Uses[LUIdx]; // Record the fixup. 
@@ -3616,10 +3688,10 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { continue; } - std::pair P = getUse( - S, LSRUse::Basic, MemAccessTy()); + std::pair P = + getUse(S, LSRUse::Basic, MemAccessTy()); size_t LUIdx = P.first; - int64_t Offset = P.second; + Immediate Offset = P.second; LSRUse &LU = Uses[LUIdx]; LSRFixup &LF = LU.getNewFixup(); LF.UserInst = const_cast(UserInst); @@ -3778,10 +3850,11 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, // Add the remaining pieces of the add back into the new formula. const SCEVConstant *InnerSumSC = dyn_cast(InnerSum); if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 && - TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() + InnerSumSC->getValue()->getZExtValue())) { F.UnfoldedOffset = - (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue(); + Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() + + InnerSumSC->getValue()->getZExtValue()); if (IsScaledReg) F.ScaledReg = nullptr; else @@ -3794,10 +3867,11 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, // Add J as its own register, or an unfolded immediate. const SCEVConstant *SC = dyn_cast(*J); if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && - TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() + SC->getValue()->getZExtValue())) F.UnfoldedOffset = - (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue(); + Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() + + SC->getValue()->getZExtValue()); else F.BaseRegs.push_back(*J); // We may have changed the number of register in base regs, adjust the @@ -3838,7 +3912,8 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base) { // This method is only interesting on a plurality of registers. if (Base.BaseRegs.size() + (Base.Scale == 1) + - (Base.UnfoldedOffset != 0) <= 1) + (Base.UnfoldedOffset.isNonZero()) <= + 1) return; // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before @@ -3889,9 +3964,9 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, // registers collected. if (NewBase.UnfoldedOffset) { assert(CombinedIntegerType && "Missing a type for the unfolded offset"); - Ops.push_back(SE.getConstant(CombinedIntegerType, NewBase.UnfoldedOffset, - true)); - NewBase.UnfoldedOffset = 0; + Ops.push_back(SE.getConstant(CombinedIntegerType, + NewBase.UnfoldedOffset.getFixedValue(), true)); + NewBase.UnfoldedOffset = Immediate::getFixed(0); GenerateFormula(SE.getAddExpr(Ops)); } } @@ -3931,15 +4006,17 @@ void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, /// Helper function for LSRInstance::GenerateConstantOffsets. void LSRInstance::GenerateConstantOffsetsImpl( LSRUse &LU, unsigned LUIdx, const Formula &Base, - const SmallVectorImpl &Worklist, size_t Idx, bool IsScaledReg) { + const SmallVectorImpl &Worklist, size_t Idx, bool IsScaledReg) { - auto GenerateOffset = [&](const SCEV *G, int64_t Offset) { + auto GenerateOffset = [&](const SCEV *G, Immediate Offset) { Formula F = Base; - F.BaseOffset = (uint64_t)Base.BaseOffset - Offset; + F.BaseOffset = Immediate::getFixed( + (uint64_t)Base.BaseOffset.getFixedValue() - Offset.getFixedValue()); if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) { // Add the offset to the base register. 
- const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), Offset), G); + const SCEV *NewG = SE.getAddExpr( + SE.getConstant(G->getType(), Offset.getFixedValue()), G); // If it cancelled out, drop the base register, otherwise update it. if (NewG->isZero()) { if (IsScaledReg) { @@ -3975,21 +4052,22 @@ void LSRInstance::GenerateConstantOffsetsImpl( int64_t Step = StepInt.isNegative() ? StepInt.getSExtValue() : StepInt.getZExtValue(); - for (int64_t Offset : Worklist) { - Offset -= Step; + for (Immediate Offset : Worklist) { + Offset = Immediate::getFixed(Offset.getFixedValue() - Step); GenerateOffset(G, Offset); } } } } - for (int64_t Offset : Worklist) + for (Immediate Offset : Worklist) GenerateOffset(G, Offset); - int64_t Imm = ExtractImmediate(G, SE); - if (G->isZero() || Imm == 0) + Immediate Imm = ExtractImmediate(G, SE); + if (G->isZero() || Imm.isZero()) return; Formula F = Base; - F.BaseOffset = (uint64_t)F.BaseOffset + Imm; + F.BaseOffset = Immediate::getFixed((uint64_t)F.BaseOffset.getFixedValue() + + Imm.getFixedValue()); if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) return; if (IsScaledReg) { @@ -4008,7 +4086,7 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base) { // TODO: For now, just add the min and max offset, because it usually isn't // worthwhile looking at everything inbetween. - SmallVector Worklist; + SmallVector Worklist; Worklist.push_back(LU.MinOffset); if (LU.MaxOffset != LU.MinOffset) Worklist.push_back(LU.MaxOffset); @@ -4048,27 +4126,29 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, if (!ConstantInt::isValueValidForType(IntTy, Factor)) continue; // Check that the multiplication doesn't overflow. - if (Base.BaseOffset == std::numeric_limits::min() && Factor == -1) + if (Base.BaseOffset.isMin() && Factor == -1) continue; - int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor; + Immediate NewBaseOffset = + Immediate::getFixed((uint64_t)Base.BaseOffset.getFixedValue() * Factor); assert(Factor != 0 && "Zero factor not expected!"); - if (NewBaseOffset / Factor != Base.BaseOffset) + if (NewBaseOffset.getFixedValue() / Factor != + Base.BaseOffset.getFixedValue()) continue; // If the offset will be truncated at this use, check that it is in bounds. if (!IntTy->isPointerTy() && - !ConstantInt::isValueValidForType(IntTy, NewBaseOffset)) + !ConstantInt::isValueValidForType(IntTy, NewBaseOffset.getFixedValue())) continue; // Check that multiplying with the use offset doesn't overflow. - int64_t Offset = LU.MinOffset; - if (Offset == std::numeric_limits::min() && Factor == -1) + Immediate Offset = LU.MinOffset; + if (Offset.isMin() && Factor == -1) continue; - Offset = (uint64_t)Offset * Factor; - if (Offset / Factor != LU.MinOffset) + Offset = Immediate::getFixed((uint64_t)Offset.getFixedValue() * Factor); + if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue()) continue; // If the offset will be truncated at this use, check that it is in bounds. if (!IntTy->isPointerTy() && - !ConstantInt::isValueValidForType(IntTy, Offset)) + !ConstantInt::isValueValidForType(IntTy, Offset.getFixedValue())) continue; Formula F = Base; @@ -4079,7 +4159,9 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, continue; // Compensate for the use having MinOffset built into it. 
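#include <cstdint>
#include <limits>
// Illustration only (standalone, assumes two's complement): the overflow
// guards above follow one pattern -- multiply in unsigned arithmetic (where
// wraparound is defined), then divide back and compare, after screening out
// INT64_MIN * -1, the one case the divide-back check cannot reach.
static bool mulOverflows(int64_t A, int64_t Factor) {
  if (A == std::numeric_limits<int64_t>::min() && Factor == -1)
    return true;
  int64_t Product = (int64_t)((uint64_t)A * (uint64_t)Factor);
  return Factor != 0 && Product / Factor != A;
}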
- F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset; + F.BaseOffset = Immediate::getFixed((uint64_t)F.BaseOffset.getFixedValue() + + Offset.getFixedValue() - + LU.MinOffset.getFixedValue()); const SCEV *FactorS = SE.getConstant(IntTy, Factor); @@ -4098,16 +4180,17 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, } // Check that multiplying with the unfolded offset doesn't overflow. - if (F.UnfoldedOffset != 0) { - if (F.UnfoldedOffset == std::numeric_limits::min() && - Factor == -1) + if (F.UnfoldedOffset.isNonZero()) { + if (F.UnfoldedOffset.isMin() && Factor == -1) continue; - F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor; - if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset) + F.UnfoldedOffset = Immediate::getFixed( + (uint64_t)F.UnfoldedOffset.getFixedValue() * Factor); + if (F.UnfoldedOffset.getFixedValue() / Factor != + Base.UnfoldedOffset.getFixedValue()) continue; // If the offset will be truncated, check that it is in bounds. - if (!IntTy->isPointerTy() && - !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset)) + if (!IntTy->isPointerTy() && !ConstantInt::isValueValidForType( + IntTy, F.UnfoldedOffset.getFixedValue())) continue; } @@ -4150,8 +4233,8 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { } // For an ICmpZero, negating a solitary base register won't lead to // new solutions. - if (LU.Kind == LSRUse::ICmpZero && - !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV) + if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg && + Base.BaseOffset.isZero() && !Base.BaseGV) continue; // For each addrec base reg, if its loop is current loop, apply the scale. for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { @@ -4277,10 +4360,10 @@ namespace { /// structures moving underneath it. struct WorkItem { size_t LUIdx; - int64_t Imm; + Immediate Imm; const SCEV *OrigReg; - WorkItem(size_t LI, int64_t I, const SCEV *R) + WorkItem(size_t LI, Immediate I, const SCEV *R) : LUIdx(LI), Imm(I), OrigReg(R) {} void print(raw_ostream &OS) const; @@ -4304,14 +4387,14 @@ LLVM_DUMP_METHOD void WorkItem::dump() const { /// opportunities between them. void LSRInstance::GenerateCrossUseConstantOffsets() { // Group the registers by their value without any added constant offset. - using ImmMapTy = std::map; + using ImmMapTy = std::map; DenseMap Map; DenseMap UsedByIndicesMap; SmallVector Sequence; for (const SCEV *Use : RegUses) { const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify. - int64_t Imm = ExtractImmediate(Reg, SE); + Immediate Imm = ExtractImmediate(Reg, SE); auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy())); if (Pair.second) Sequence.push_back(Reg); @@ -4323,7 +4406,8 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // a list of work to do and do the work in a separate step so that we're // not adding formulae and register counts while we're searching. 
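#include <cstdint>
#include <map>
// Illustration only (standalone, hypothetical data): the grouping above keys
// every interesting register by its value with the constant stripped, and the
// loop that follows mines each per-base set of immediates for reusable deltas,
// e.g. rewriting reg+20 as (reg+4) + 16 so two uses share one register.
static int64_t reuseDeltaDemo() {
  std::map<int64_t, const char *> ImmsForBase; // immediate -> originating expr
  ImmsForBase[4] = "reg+4";
  ImmsForBase[20] = "reg+20";
  return ImmsForBase.rbegin()->first - ImmsForBase.begin()->first; // 16
}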
SmallVector WorkItems; - SmallSet, 32> UniqueItems; + SmallSet, 32, KeyOrderSizeTAndImmediate> + UniqueItems; for (const SCEV *Reg : Sequence) { const ImmMapTy &Imms = Map.find(Reg)->second; @@ -4342,7 +4426,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { J != JE; ++J) { const SCEV *OrigReg = J->second; - int64_t JImm = J->first; + Immediate JImm = J->first; const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg); if (!isa(OrigReg) && @@ -4354,8 +4438,8 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // Conservatively examine offsets between this orig reg a few selected // other orig regs. - int64_t First = Imms.begin()->first; - int64_t Last = std::prev(Imms.end())->first; + int64_t First = Imms.begin()->first.getFixedValue(); + int64_t Last = std::prev(Imms.end())->first.getFixedValue(); // Compute (First + Last) / 2 without overflow using the fact that // First + Last = 2 * (First + Last) + (First ^ Last). int64_t Avg = (First & Last) + ((First ^ Last) >> 1); @@ -4364,12 +4448,13 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { Avg = Avg + ((First ^ Last) & ((uint64_t)Avg >> 63)); ImmMapTy::const_iterator OtherImms[] = { Imms.begin(), std::prev(Imms.end()), - Imms.lower_bound(Avg)}; + Imms.lower_bound(Immediate::getFixed(Avg))}; for (const auto &M : OtherImms) { if (M == J || M == JE) continue; // Compute the difference between the two. - int64_t Imm = (uint64_t)JImm - M->first; + Immediate Imm = Immediate::getFixed((uint64_t)JImm.getFixedValue() - + M->first.getFixedValue()); for (unsigned LUIdx : UsedByIndices.set_bits()) // Make a memo of this use, offset, and register tuple. if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second) @@ -4387,11 +4472,12 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { for (const WorkItem &WI : WorkItems) { size_t LUIdx = WI.LUIdx; LSRUse &LU = Uses[LUIdx]; - int64_t Imm = WI.Imm; + Immediate Imm = WI.Imm; const SCEV *OrigReg = WI.OrigReg; Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType()); - const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm)); + const SCEV *NegImmS = + SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm.getFixedValue())); unsigned BitWidth = SE.getTypeSizeInBits(IntTy); // TODO: Use a more targeted data structure. @@ -4404,10 +4490,12 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { F.unscale(); // Use the immediate in the scaled register. if (F.ScaledReg == OrigReg) { - int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale; + Immediate Offset = + Immediate::getFixed((uint64_t)F.BaseOffset.getFixedValue() + + Imm.getFixedValue() * (uint64_t)F.Scale); // Don't create 50 + reg(-50). if (F.referencesReg(SE.getSCEV( - ConstantInt::get(IntTy, -(uint64_t)Offset)))) + ConstantInt::get(IntTy, -(uint64_t)Offset.getFixedValue())))) continue; Formula NewF = F; NewF.BaseOffset = Offset; @@ -4420,9 +4508,10 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // value to the immediate would produce a value closer to zero than the // immediate itself, then the formula isn't worthwhile. if (const SCEVConstant *C = dyn_cast(NewF.ScaledReg)) - if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) && + if (C->getValue()->isNegative() != + (NewF.BaseOffset.isLessThanZero()) && (C->getAPInt().abs() * APInt(BitWidth, F.Scale)) - .ule(std::abs(NewF.BaseOffset))) + .ule(std::abs(NewF.BaseOffset.getFixedValue()))) continue; // OK, looks good. 
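#include <cassert>
#include <cstdint>
// Illustration only (standalone, assumes arithmetic right shift): the midpoint
// computation above rests on the identity
//   First + Last == 2 * (First & Last) + (First ^ Last)
// (shared bits counted twice, differing bits once), so halving never
// overflows; the extra term rounds an inexact negative midpoint toward zero.
static int64_t midpointNoOverflow(int64_t First, int64_t Last) {
  int64_t Avg = (First & Last) + ((First ^ Last) >> 1);
  Avg += (First ^ Last) & ((uint64_t)Avg >> 63); // +1 if Avg < 0 and sum odd
  return Avg;
}
static void midpointDemo() {
  assert(midpointNoOverflow(3, 8) == 5);    // (3 + 8) / 2 truncated
  assert(midpointNoOverflow(-8, -3) == -5); // rounds toward zero, not -6
}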
@@ -4435,16 +4524,21 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { if (BaseReg != OrigReg) continue; Formula NewF = F; - NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm; + NewF.BaseOffset = Immediate::getFixed( + (uint64_t)NewF.BaseOffset.getFixedValue() + Imm.getFixedValue()); if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, NewF)) { if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE)) continue; - if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm)) + if (!TTI.isLegalAddImmediate( + (uint64_t)NewF.UnfoldedOffset.getFixedValue() + + Imm.getFixedValue())) continue; NewF = F; - NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm; + NewF.UnfoldedOffset = Immediate::getFixed( + (uint64_t)NewF.UnfoldedOffset.getFixedValue() + + Imm.getFixedValue()); } NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg); @@ -4453,11 +4547,13 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // zero than the immediate itself, then the formula isn't worthwhile. for (const SCEV *NewReg : NewF.BaseRegs) if (const SCEVConstant *C = dyn_cast(NewReg)) - if ((C->getAPInt() + NewF.BaseOffset) + if ((C->getAPInt() + NewF.BaseOffset.getFixedValue()) .abs() - .slt(std::abs(NewF.BaseOffset)) && - (C->getAPInt() + NewF.BaseOffset).countr_zero() >= - (unsigned)llvm::countr_zero(NewF.BaseOffset)) + .slt(std::abs(NewF.BaseOffset.getFixedValue())) && + (C->getAPInt() + NewF.BaseOffset.getFixedValue()) + .countr_zero() >= + (unsigned)llvm::countr_zero( + NewF.BaseOffset.getFixedValue())) goto skip_formula; // Ok, looks good. @@ -4651,7 +4747,9 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { Formula NewF = F; //FIXME: Formulas should store bitwidth to do wrapping properly. // See PR41034. - NewF.BaseOffset += (uint64_t)C->getValue()->getSExtValue(); + NewF.BaseOffset = + Immediate::getFixed(NewF.BaseOffset.getFixedValue() + + (uint64_t)C->getValue()->getSExtValue()); NewF.BaseRegs.erase(NewF.BaseRegs.begin() + (I - F.BaseRegs.begin())); if (LU.HasFormulaWithSameRegs(NewF)) { @@ -4707,7 +4805,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { LSRUse &LU = Uses[LUIdx]; for (const Formula &F : LU.Formulae) { - if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1)) + if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1)) continue; LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU); @@ -5543,30 +5641,33 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF, } // Expand the immediate portion. - int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset; - if (Offset != 0) { + Immediate Offset = Immediate::getFixed( + (uint64_t)F.BaseOffset.getFixedValue() + LF.Offset.getFixedValue()); + if (Offset.isNonZero()) { if (LU.Kind == LSRUse::ICmpZero) { // The other interesting way of "folding" with an ICmpZero is to use a // negated immediate. if (!ICmpScaledV) - ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset); + ICmpScaledV = + ConstantInt::get(IntTy, -(uint64_t)Offset.getFixedValue()); else { Ops.push_back(SE.getUnknown(ICmpScaledV)); - ICmpScaledV = ConstantInt::get(IntTy, Offset); + ICmpScaledV = ConstantInt::get(IntTy, Offset.getFixedValue()); } } else { // Just add the immediate values. These again are expected to be matched // as part of the address. 
- Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset))); + Ops.push_back( + SE.getUnknown(ConstantInt::getSigned(IntTy, Offset.getFixedValue()))); } } // Expand the unfolded offset portion. - int64_t UnfoldedOffset = F.UnfoldedOffset; - if (UnfoldedOffset != 0) { + Immediate UnfoldedOffset = F.UnfoldedOffset; + if (UnfoldedOffset.isNonZero()) { // Just add the immediate values. - Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, - UnfoldedOffset))); + Ops.push_back(SE.getUnknown( + ConstantInt::getSigned(IntTy, UnfoldedOffset.getFixedValue()))); } // Emit instructions summing all the operands. @@ -5602,7 +5703,7 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF, "ICmp does not support folding a global value and " "a scale at the same time!"); Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy), - -(uint64_t)Offset); + -(uint64_t)Offset.getFixedValue()); if (C->getType() != OpTy) { C = ConstantFoldCastOperand( CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy, From 6265205f73b92bebdd6e17d43ac4ee610bd9a255 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Tue, 14 May 2024 16:04:13 +0000 Subject: [PATCH 3/4] Scalable work --- .../Transforms/Scalar/LoopStrengthReduce.cpp | 245 +++++++++++++----- .../AArch64/vscale-fixups.ll | 245 ++++++++++-------- 2 files changed, 329 insertions(+), 161 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 0b77b7b5a62de..6fd1685a8fa76 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -197,6 +197,14 @@ static cl::opt AllowDropSolutionIfLessProfitable( "lsr-drop-solution", cl::Hidden, cl::desc("Attempt to drop solution if it is less profitable")); +static cl::opt EnableVScaleImmediates( + "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true), + cl::desc("Enable analysis of vscale-relative immediates in LSR")); + +static cl::opt DropScaledForVScale( + "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true), + cl::desc("Avoid using scaled registers with vscale-relative addressing")); + STATISTIC(NumTermFold, "Number of terminating condition fold recognized and performed"); @@ -273,6 +281,10 @@ class Immediate : public details::FixedOrScalableQuantity { constexpr bool isGreaterThanZero() const { return Quantity > 0; } + constexpr bool isCompatibleImmediate(const Immediate &Imm) const { + return isZero() || Imm.isZero() || Imm.Scalable == Scalable; + } + constexpr bool isMin() const { return Quantity == std::numeric_limits::min(); } @@ -880,7 +892,13 @@ static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { // FIXME: AR->getNoWrapFlags(SCEV::FlagNW) SCEV::FlagAnyWrap); return Result; - } + } else if (EnableVScaleImmediates) + if (const SCEVMulExpr *M = dyn_cast(S)) + if (const SCEVConstant *C = dyn_cast(M->getOperand(0))) + if (isa(M->getOperand(1))) { + S = SE.getConstant(M->getType(), 0); + return Immediate::getScalable(C->getValue()->getSExtValue()); + } return Immediate(); } @@ -1373,7 +1391,7 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg, // If the step size matches the base offset, we could use pre-indexed // addressing. 
- if (AMK == TTI::AMK_PreIndexed) { + if (AMK == TTI::AMK_PreIndexed && !F.BaseOffset.isScalable()) { if (auto *Step = dyn_cast(AR->getStepRecurrence(*SE))) if (Step->getAPInt() == F.BaseOffset.getFixedValue()) LoopCost = 0; @@ -1473,14 +1491,18 @@ void Cost::RateFormula(const Formula &F, // Tally up the non-zero immediates. for (const LSRFixup &Fixup : LU.Fixups) { - Immediate O = Fixup.Offset; - Immediate Offset = Immediate::getFixed((uint64_t)O.getFixedValue() + - F.BaseOffset.getFixedValue()); + // FIXME: We probably want to noticeably increase the cost if the + // two offsets differ in scalability? + bool Scalable = Fixup.Offset.isScalable() || F.BaseOffset.isScalable(); + int64_t O = Fixup.Offset.getKnownMinValue(); + Immediate Offset = Immediate::get( + (uint64_t)(O) + F.BaseOffset.getKnownMinValue(), Scalable); if (F.BaseGV) C.ImmCost += 64; // Handle symbolic values conservatively. // TODO: This should probably be the pointer size. else if (Offset.isNonZero()) - C.ImmCost += APInt(64, Offset.getFixedValue(), true).getSignificantBits(); + C.ImmCost += + APInt(64, Offset.getKnownMinValue(), true).getSignificantBits(); // Check with target if this offset with this instruction is // specifically not supported. @@ -1744,12 +1766,15 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, Instruction *Fixup /* = nullptr */, int64_t ScalableOffset) { switch (Kind) { - case LSRUse::Address: - return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, - BaseOffset.getFixedValue(), HasBaseReg, - Scale, AccessTy.AddrSpace, Fixup, - ScalableOffset); - + case LSRUse::Address: { + int64_t FixedOffset = + BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue(); + int64_t ScalableOffset = + BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0; + return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset, + HasBaseReg, Scale, AccessTy.AddrSpace, + Fixup, ScalableOffset); + } case LSRUse::ICmpZero: // There's not even a target hook for querying whether it would be legal to // fold a GV into an ICmp. @@ -1768,6 +1793,11 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, // If we have low-level target information, ask the target if it can fold an // integer immediate on an icmp. if (BaseOffset.isNonZero()) { + // We don't have an interface to query whether the target supports + // icmpzero against scalable quantities yet. + if (BaseOffset.isScalable()) + return false; + // We have one of: // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset @@ -1799,19 +1829,20 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale) { - int64_t Base = BaseOffset.getFixedValue(); - int64_t Min = MinOffset.getFixedValue(); - int64_t Max = MaxOffset.getFixedValue(); + if (BaseOffset.isNonZero() && + (BaseOffset.isScalable() != MinOffset.isScalable() || + BaseOffset.isScalable() != MaxOffset.isScalable())) + return false; // Check for overflow. 
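#include <cstdint>
// Illustration only (standalone, assumes two's complement): the guard below
// detects signed overflow by adding in unsigned arithmetic (wraparound is
// defined) and checking that the sum moved in the direction the addend's sign
// says it should; if the directions disagree, the addition wrapped.
static bool addOverflows(int64_t Base, int64_t Off) {
  int64_t Sum = (int64_t)((uint64_t)Base + (uint64_t)Off);
  return (Sum > Base) != (Off > 0);
}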
+ int64_t Base = BaseOffset.getKnownMinValue(); + int64_t Min = MinOffset.getKnownMinValue(); + int64_t Max = MaxOffset.getKnownMinValue(); if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0)) return false; - Min = (uint64_t)Base + Min; + MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable()); if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0)) return false; - Max = (uint64_t)Base + Max; - - MinOffset = Immediate::getFixed(Min); - MaxOffset = Immediate::getFixed(Max); + MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable()); return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset, HasBaseReg, Scale) && @@ -1857,6 +1888,14 @@ static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset, F.BaseOffset, F.HasBaseReg, F.Scale); } +static bool isLegalAddImmediate(const TargetTransformInfo &TTI, + Immediate Offset) { + if (Offset.isScalable()) + return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue()); + + return TTI.isLegalAddImmediate(Offset.getFixedValue()); +} + static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F) { // Target may want to look at the user instructions. @@ -1888,14 +1927,20 @@ static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, switch (LU.Kind) { case LSRUse::Address: { - int64_t FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue(); - int64_t FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue(); // Check the scaling factor cost with both the min and max offsets. + int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0; + if (F.BaseOffset.isScalable()) { + ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue(); + ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue(); + } else { + FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue(); + FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue(); + } InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost( - LU.AccessTy.MemTy, F.BaseGV, StackOffset::getFixed(FixedMin), + LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMin, ScalableMin), F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace); InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost( - LU.AccessTy.MemTy, F.BaseGV, StackOffset::getFixed(FixedMax), + LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMax, ScalableMax), F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace); assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() && @@ -1932,6 +1977,15 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, HasBaseReg = true; } + // FIXME: Try with + without a scale? Maybe based on TTI? + // I think basereg + scaledreg + immediateoffset isn't a good 'conservative' + // default for many architectures, not just AArch64 SVE. More investigation + // needed later to determine if this should be used more widely than just + // on scalable types. + if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero && + AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale) + Scale = 0; + return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale, nullptr, ScalableOffset); } @@ -1956,6 +2010,9 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, if (BaseOffset.isZero() && !BaseGV) return true; + if (BaseOffset.isScalable()) + return false; + // Conservatively, create an address with an immediate and a // base and a scale. int64_t Scale = Kind == LSRUse::ICmpZero ? 
-1 : 1; @@ -2677,6 +2734,13 @@ bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset, NewMaxOffset = NewOffset; } + // FIXME: We should be able to handle some level of scalable offset support + // for 'void', but in order to get basic support up and running this is + // being left out. + if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() && + (NewMinOffset.isScalable() || NewMaxOffset.isScalable())) + return false; + // Update the use. LU.MinOffset = NewMinOffset; LU.MaxOffset = NewMaxOffset; @@ -4010,13 +4074,22 @@ void LSRInstance::GenerateConstantOffsetsImpl( auto GenerateOffset = [&](const SCEV *G, Immediate Offset) { Formula F = Base; - F.BaseOffset = Immediate::getFixed( - (uint64_t)Base.BaseOffset.getFixedValue() - Offset.getFixedValue()); + if (Base.BaseOffset.isScalable() != Offset.isScalable() && + Base.BaseOffset.isNonZero() && Offset.isNonZero()) + return; + bool Scalable = Base.BaseOffset.isScalable() || Offset.isScalable(); + F.BaseOffset = Immediate::get((uint64_t)Base.BaseOffset.getKnownMinValue() - + Offset.getKnownMinValue(), + Scalable); if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) { // Add the offset to the base register. - const SCEV *NewG = SE.getAddExpr( - SE.getConstant(G->getType(), Offset.getFixedValue()), G); + const SCEV *NewOffset = + SE.getConstant(G->getType(), Offset.getKnownMinValue()); + if (Scalable) + NewOffset = + SE.getMulExpr(NewOffset, SE.getVScale(NewOffset->getType())); + const SCEV *NewG = SE.getAddExpr(NewOffset, G); // If it cancelled out, drop the base register, otherwise update it. if (NewG->isZero()) { if (IsScaledReg) { @@ -4053,8 +4126,10 @@ void LSRInstance::GenerateConstantOffsetsImpl( StepInt.getSExtValue() : StepInt.getZExtValue(); for (Immediate Offset : Worklist) { - Offset = Immediate::getFixed(Offset.getFixedValue() - Step); - GenerateOffset(G, Offset); + if (!Offset.isScalable()) { + Offset = Immediate::getFixed(Offset.getFixedValue() - Step); + GenerateOffset(G, Offset); + } } } } @@ -4063,11 +4138,13 @@ void LSRInstance::GenerateConstantOffsetsImpl( GenerateOffset(G, Offset); Immediate Imm = ExtractImmediate(G, SE); - if (G->isZero() || Imm.isZero()) + if (G->isZero() || Imm.isZero() || + Base.BaseOffset.isScalable() != Imm.isScalable()) return; Formula F = Base; - F.BaseOffset = Immediate::getFixed((uint64_t)F.BaseOffset.getFixedValue() + - Imm.getFixedValue()); + F.BaseOffset = Immediate::get((uint64_t)F.BaseOffset.getKnownMinValue() + + Imm.getKnownMinValue(), + Imm.isScalable()); if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) return; if (IsScaledReg) { @@ -4438,23 +4515,39 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // Conservatively examine offsets between this orig reg a few selected // other orig regs. - int64_t First = Imms.begin()->first.getFixedValue(); - int64_t Last = std::prev(Imms.end())->first.getFixedValue(); + Immediate First = Imms.begin()->first; + Immediate Last = std::prev(Imms.end())->first; + if (First.isScalable() != Last.isScalable() && First.isNonZero() && + Last.isNonZero()) { + LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg + << "\n"); + continue; + } + // Only scalable if both terms are scalable, or if one is scalable and + // the other is 0. 
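The comment just above states the mixing rule that recurs throughout the series: two offsets may only be combined when they agree on scalability, with zero treated as neutral. The final patch in the series replaces these open-coded checks with an Immediate::isCompatibleImmediate() helper; the predicate itself is just the following (sketch only, not part of the patch):

  #include <cstdint>

  // Compatible when both sides are fixed, both are scalable, or at least one
  // of them is zero (a zero offset means the same thing whether or not it is
  // scaled by vscale).
  bool compatibleOffsets(int64_t AVal, bool AScalable,
                         int64_t BVal, bool BScalable) {
    if (AVal == 0 || BVal == 0)
      return true;
    return AScalable == BScalable;
  }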
+ bool Scalable = First.isScalable() || Last.isScalable(); + int64_t FI = First.getKnownMinValue(); + int64_t LI = Last.getKnownMinValue(); // Compute (First + Last) / 2 without overflow using the fact that // First + Last = 2 * (First + Last) + (First ^ Last). - int64_t Avg = (First & Last) + ((First ^ Last) >> 1); - // If the result is negative and First is odd and Last even (or vice versa), + int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1); + // If the result is negative and FI is odd and LI even (or vice versa), // we rounded towards -inf. Add 1 in that case, to round towards 0. - Avg = Avg + ((First ^ Last) & ((uint64_t)Avg >> 63)); + Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63)); ImmMapTy::const_iterator OtherImms[] = { Imms.begin(), std::prev(Imms.end()), - Imms.lower_bound(Immediate::getFixed(Avg))}; + Imms.lower_bound(Immediate::get(Avg, Scalable))}; for (const auto &M : OtherImms) { if (M == J || M == JE) continue; + if (JImm.isScalable() != M->first.isScalable() && JImm.isNonZero() && + M->first.isNonZero()) + continue; // Compute the difference between the two. - Immediate Imm = Immediate::getFixed((uint64_t)JImm.getFixedValue() - - M->first.getFixedValue()); + bool Scalable = JImm.isScalable() || M->first.isScalable(); + Immediate Imm = Immediate::get((uint64_t)JImm.getKnownMinValue() - + M->first.getKnownMinValue(), + Scalable); for (unsigned LUIdx : UsedByIndices.set_bits()) // Make a memo of this use, offset, and register tuple. if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second) @@ -4477,7 +4570,9 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType()); const SCEV *NegImmS = - SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm.getFixedValue())); + SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm.getKnownMinValue())); + if (Imm.isScalable()) + NegImmS = SE.getMulExpr(NegImmS, SE.getVScale(NegImmS->getType())); unsigned BitWidth = SE.getTypeSizeInBits(IntTy); // TODO: Use a more targeted data structure. @@ -4490,12 +4585,20 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { F.unscale(); // Use the immediate in the scaled register. if (F.ScaledReg == OrigReg) { + if (F.BaseOffset.isScalable() != Imm.isScalable() && + F.BaseOffset.isNonZero() && Imm.isNonZero()) + continue; + bool Scalable = F.BaseOffset.isScalable() || Imm.isScalable(); Immediate Offset = - Immediate::getFixed((uint64_t)F.BaseOffset.getFixedValue() + - Imm.getFixedValue() * (uint64_t)F.Scale); + Immediate::get((uint64_t)F.BaseOffset.getKnownMinValue() + + Imm.getKnownMinValue() * (uint64_t)F.Scale, + Scalable); // Don't create 50 + reg(-50). 
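The midpoint computation a few lines up is the standard overflow-free average. The pre-existing comment writes the middle term as 2 * (First + Last); the identity that actually holds (and that the code relies on) is First + Last == 2 * (First & Last) + (First ^ Last). A self-contained version of the trick, including the round-towards-zero fix-up, for reference:

  #include <cstdint>

  int64_t midpointNoOverflow(int64_t A, int64_t B) {
    // (A & B) + ((A ^ B) >> 1) is (A + B) / 2 rounded towards -inf, computed
    // without ever forming A + B.
    int64_t Avg = (A & B) + ((A ^ B) >> 1);
    // When the midpoint is negative and A, B differ in parity, add the low
    // bit back so the result rounds towards zero instead of -inf.
    Avg += (A ^ B) & ((uint64_t)Avg >> 63);
    return Avg;
  }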
- if (F.referencesReg(SE.getSCEV( - ConstantInt::get(IntTy, -(uint64_t)Offset.getFixedValue())))) + const SCEV *S = SE.getSCEV( + ConstantInt::get(IntTy, -(uint64_t)Offset.getKnownMinValue())); + if (Scalable) + S = SE.getMulExpr(S, SE.getVScale(S->getType())); + if (F.referencesReg(S)) continue; Formula NewF = F; NewF.BaseOffset = Offset; @@ -4524,21 +4627,29 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { if (BaseReg != OrigReg) continue; Formula NewF = F; - NewF.BaseOffset = Immediate::getFixed( - (uint64_t)NewF.BaseOffset.getFixedValue() + Imm.getFixedValue()); + if (!NewF.BaseOffset.isCompatibleImmediate(Imm) || + !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) || + !NewF.BaseOffset.isCompatibleImmediate(NewF.UnfoldedOffset)) + continue; + bool Scalable = NewF.BaseOffset.isScalable() || Imm.isScalable() || + NewF.UnfoldedOffset.isScalable(); + NewF.BaseOffset = + Immediate::get((uint64_t)NewF.BaseOffset.getKnownMinValue() + + Imm.getKnownMinValue(), + Scalable); if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, NewF)) { if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE)) continue; - if (!TTI.isLegalAddImmediate( - (uint64_t)NewF.UnfoldedOffset.getFixedValue() + - Imm.getFixedValue())) + Immediate NewUnfoldedOffset = Immediate::get( + (uint64_t)NewF.UnfoldedOffset.getKnownMinValue() + + Imm.getKnownMinValue(), + Scalable); + if (!isLegalAddImmediate(TTI, NewUnfoldedOffset)) continue; NewF = F; - NewF.UnfoldedOffset = Immediate::getFixed( - (uint64_t)NewF.UnfoldedOffset.getFixedValue() + - Imm.getFixedValue()); + NewF.UnfoldedOffset = NewUnfoldedOffset; } NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg); @@ -5640,9 +5751,17 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF, Ops.push_back(SE.getUnknown(FullV)); } + // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail + // out at this point, or should we generate a SCEV adding together mixed + // offsets? + assert((F.BaseOffset.isScalable() == LF.Offset.isScalable() || + F.BaseOffset.isZero() || LF.Offset.isZero()) && + "Expanding mismatched offsets\n"); + bool Scalable = F.BaseOffset.isScalable() || LF.Offset.isScalable(); // Expand the immediate portion. - Immediate Offset = Immediate::getFixed( - (uint64_t)F.BaseOffset.getFixedValue() + LF.Offset.getFixedValue()); + Immediate Offset = Immediate::get((uint64_t)F.BaseOffset.getKnownMinValue() + + LF.Offset.getKnownMinValue(), + Scalable); if (Offset.isNonZero()) { if (LU.Kind == LSRUse::ICmpZero) { // The other interesting way of "folding" with an ICmpZero is to use a @@ -5657,17 +5776,23 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF, } else { // Just add the immediate values. These again are expected to be matched // as part of the address. - Ops.push_back( - SE.getUnknown(ConstantInt::getSigned(IntTy, Offset.getFixedValue()))); + const SCEV *SU = SE.getUnknown( + ConstantInt::getSigned(IntTy, Offset.getKnownMinValue())); + if (Scalable) + SU = SE.getMulExpr(SU, SE.getVScale(SU->getType())); + Ops.push_back(SU); } } // Expand the unfolded offset portion. Immediate UnfoldedOffset = F.UnfoldedOffset; if (UnfoldedOffset.isNonZero()) { + const SCEV *SU = SE.getUnknown( + ConstantInt::getSigned(IntTy, UnfoldedOffset.getKnownMinValue())); + if (UnfoldedOffset.isScalable()) + SU = SE.getMulExpr(SU, SE.getVScale(SU->getType())); // Just add the immediate values. 
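The Expand() changes in this hunk repeat one pattern: build the immediate as a SCEV, then multiply by SE.getVScale() when the offset is scalable. Distilled into a free-standing helper (the helper name is invented for illustration; the final patch packages the same thing as Immediate::getUnknownSCEV):

  // Sketch only: turn a fixed-or-scalable immediate into the SCEV operand
  // that Expand() pushes onto Ops, using the same ScalarEvolution calls as
  // the hunk above.
  static const SCEV *immediateAsSCEVOperand(ScalarEvolution &SE, Type *IntTy,
                                            int64_t MinVal, bool Scalable) {
    const SCEV *S = SE.getUnknown(ConstantInt::getSigned(IntTy, MinVal));
    if (Scalable)
      S = SE.getMulExpr(S, SE.getVScale(S->getType()));
    return S;
  }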
- Ops.push_back(SE.getUnknown( - ConstantInt::getSigned(IntTy, UnfoldedOffset.getFixedValue()))); + Ops.push_back(SU); } // Emit instructions summing all the operands. diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll index 367806ac438b0..483955c1c57a0 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll @@ -62,26 +62,22 @@ for.exit: define void @many_mulvl1_addressing(ptr %src_rows, ptr %dst_rows, i64 %stride, i64 %count) #0 { ; COMMON-LABEL: many_mulvl1_addressing: ; COMMON: // %bb.0: // %entry -; COMMON-NEXT: mov x9, x1 ; COMMON-NEXT: ptrue p0.b -; COMMON-NEXT: add x10, x0, x2 -; COMMON-NEXT: inch x9 ; COMMON-NEXT: ptrue p1.h -; COMMON-NEXT: mov x8, xzr -; COMMON-NEXT: addvl x11, x10, #1 -; COMMON-NEXT: addvl x12, x0, #1 ; COMMON-NEXT: .LBB1_1: // %for.body ; COMMON-NEXT: // =>This Inner Loop Header: Depth=1 -; COMMON-NEXT: ld1b { z0.b }, p0/z, [x0, x8] -; COMMON-NEXT: ld1b { z1.b }, p0/z, [x10, x8] +; COMMON-NEXT: add x8, x0, x2 +; COMMON-NEXT: ld1b { z0.b }, p0/z, [x0] +; COMMON-NEXT: ld1b { z1.b }, p0/z, [x0, x2] +; COMMON-NEXT: ld1b { z2.b }, p0/z, [x0, #1, mul vl] +; COMMON-NEXT: ld1b { z3.b }, p0/z, [x8, #1, mul vl] ; COMMON-NEXT: subs x3, x3, #1 -; COMMON-NEXT: ld1b { z2.b }, p0/z, [x12, x8] -; COMMON-NEXT: ld1b { z3.b }, p0/z, [x11, x8] +; COMMON-NEXT: addvl x0, x0, #2 ; COMMON-NEXT: add z0.b, z0.b, z1.b ; COMMON-NEXT: add z1.b, z2.b, z3.b -; COMMON-NEXT: st1b { z0.h }, p1, [x1, x8] -; COMMON-NEXT: st1b { z1.h }, p1, [x9, x8] -; COMMON-NEXT: addvl x8, x8, #2 +; COMMON-NEXT: st1b { z0.h }, p1, [x1] +; COMMON-NEXT: st1b { z1.h }, p1, [x1, #1, mul vl] +; COMMON-NEXT: addvl x1, x1, #2 ; COMMON-NEXT: b.ne .LBB1_1 ; COMMON-NEXT: // %bb.2: // %for.exit ; COMMON-NEXT: ret @@ -121,55 +117,21 @@ for.exit: } define void @fixed_iv_scalable_offset(ptr %src, ptr %dst, i64 %count) #0 { -; BASE-LABEL: fixed_iv_scalable_offset: -; BASE: // %bb.0: // %entry -; BASE-NEXT: ptrue p0.s -; BASE-NEXT: .LBB2_1: // %for.body -; BASE-NEXT: // =>This Inner Loop Header: Depth=1 -; BASE-NEXT: ld1w { z0.s }, p0/z, [x0] -; BASE-NEXT: ld1w { z1.s }, p0/z, [x0, #4, mul vl] -; BASE-NEXT: subs x2, x2, #4 -; BASE-NEXT: add x0, x0, #16 -; BASE-NEXT: add z0.s, z0.s, z1.s -; BASE-NEXT: st1w { z0.s }, p0, [x1] -; BASE-NEXT: add x1, x1, #16 -; BASE-NEXT: b.ne .LBB2_1 -; BASE-NEXT: // %bb.2: // %for.exit -; BASE-NEXT: ret -; -; PREINDEX-LABEL: fixed_iv_scalable_offset: -; PREINDEX: // %bb.0: // %entry -; PREINDEX-NEXT: ptrue p0.s -; PREINDEX-NEXT: .LBB2_1: // %for.body -; PREINDEX-NEXT: // =>This Inner Loop Header: Depth=1 -; PREINDEX-NEXT: ld1w { z0.s }, p0/z, [x0] -; PREINDEX-NEXT: ld1w { z1.s }, p0/z, [x0, #4, mul vl] -; PREINDEX-NEXT: subs x2, x2, #4 -; PREINDEX-NEXT: add x0, x0, #16 -; PREINDEX-NEXT: add z0.s, z0.s, z1.s -; PREINDEX-NEXT: st1w { z0.s }, p0, [x1] -; PREINDEX-NEXT: add x1, x1, #16 -; PREINDEX-NEXT: b.ne .LBB2_1 -; PREINDEX-NEXT: // %bb.2: // %for.exit -; PREINDEX-NEXT: ret -; -; POSTINDEX-LABEL: fixed_iv_scalable_offset: -; POSTINDEX: // %bb.0: // %entry -; POSTINDEX-NEXT: ptrue p0.s -; POSTINDEX-NEXT: addvl x8, x0, #4 -; POSTINDEX-NEXT: .LBB2_1: // %for.body -; POSTINDEX-NEXT: // =>This Inner Loop Header: Depth=1 -; POSTINDEX-NEXT: ld1w { z0.s }, p0/z, [x0] -; POSTINDEX-NEXT: ld1w { z1.s }, p0/z, [x8] -; POSTINDEX-NEXT: subs x2, x2, #4 -; POSTINDEX-NEXT: add x8, x8, #16 -; POSTINDEX-NEXT: add x0, x0, #16 -; 
POSTINDEX-NEXT: add z0.s, z0.s, z1.s -; POSTINDEX-NEXT: st1w { z0.s }, p0, [x1] -; POSTINDEX-NEXT: add x1, x1, #16 -; POSTINDEX-NEXT: b.ne .LBB2_1 -; POSTINDEX-NEXT: // %bb.2: // %for.exit -; POSTINDEX-NEXT: ret +; COMMON-LABEL: fixed_iv_scalable_offset: +; COMMON: // %bb.0: // %entry +; COMMON-NEXT: ptrue p0.s +; COMMON-NEXT: .LBB2_1: // %for.body +; COMMON-NEXT: // =>This Inner Loop Header: Depth=1 +; COMMON-NEXT: ld1w { z0.s }, p0/z, [x0] +; COMMON-NEXT: ld1w { z1.s }, p0/z, [x0, #4, mul vl] +; COMMON-NEXT: subs x2, x2, #4 +; COMMON-NEXT: add x0, x0, #16 +; COMMON-NEXT: add z0.s, z0.s, z1.s +; COMMON-NEXT: st1w { z0.s }, p0, [x1] +; COMMON-NEXT: add x1, x1, #16 +; COMMON-NEXT: b.ne .LBB2_1 +; COMMON-NEXT: // %bb.2: // %for.exit +; COMMON-NEXT: ret entry: %vscale = tail call i64 @llvm.vscale.i64() %mul = shl nuw nsw i64 %vscale, 4 @@ -193,26 +155,66 @@ for.exit: } define void @mixed_offsets_scalable_then_fixed(ptr %src, ptr %dst, i64 %count) #0 { -; COMMON-LABEL: mixed_offsets_scalable_then_fixed: -; COMMON: // %bb.0: // %entry -; COMMON-NEXT: ptrue p0.s -; COMMON-NEXT: mov x8, xzr -; COMMON-NEXT: addvl x9, x0, #4 -; COMMON-NEXT: mov x10, #8 // =0x8 -; COMMON-NEXT: .LBB3_1: // %for.body -; COMMON-NEXT: // =>This Inner Loop Header: Depth=1 -; COMMON-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; COMMON-NEXT: ld1w { z1.s }, p0/z, [x9, x8, lsl #2] -; COMMON-NEXT: add x11, x9, x8, lsl #2 -; COMMON-NEXT: ld1w { z2.s }, p0/z, [x11, x10, lsl #2] -; COMMON-NEXT: add z0.s, z0.s, z1.s -; COMMON-NEXT: add z0.s, z0.s, z2.s -; COMMON-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] -; COMMON-NEXT: incw x8 -; COMMON-NEXT: cmp x2, x8 -; COMMON-NEXT: b.ne .LBB3_1 -; COMMON-NEXT: // %bb.2: // %for.exit -; COMMON-NEXT: ret +; BASE-LABEL: mixed_offsets_scalable_then_fixed: +; BASE: // %bb.0: // %entry +; BASE-NEXT: ptrue p0.s +; BASE-NEXT: addvl x8, x0, #4 +; BASE-NEXT: mov x9, #8 // =0x8 +; BASE-NEXT: .LBB3_1: // %for.body +; BASE-NEXT: // =>This Inner Loop Header: Depth=1 +; BASE-NEXT: ld1w { z0.s }, p0/z, [x8, #-4, mul vl] +; BASE-NEXT: ld1w { z1.s }, p0/z, [x8] +; BASE-NEXT: decw x2 +; BASE-NEXT: ld1w { z2.s }, p0/z, [x8, x9, lsl #2] +; BASE-NEXT: addvl x8, x8, #1 +; BASE-NEXT: add z0.s, z0.s, z1.s +; BASE-NEXT: add z0.s, z0.s, z2.s +; BASE-NEXT: st1w { z0.s }, p0, [x1] +; BASE-NEXT: addvl x1, x1, #1 +; BASE-NEXT: cbnz x2, .LBB3_1 +; BASE-NEXT: // %bb.2: // %for.exit +; BASE-NEXT: ret +; +; PREINDEX-LABEL: mixed_offsets_scalable_then_fixed: +; PREINDEX: // %bb.0: // %entry +; PREINDEX-NEXT: ptrue p0.s +; PREINDEX-NEXT: addvl x8, x0, #4 +; PREINDEX-NEXT: mov x9, #8 // =0x8 +; PREINDEX-NEXT: .LBB3_1: // %for.body +; PREINDEX-NEXT: // =>This Inner Loop Header: Depth=1 +; PREINDEX-NEXT: ld1w { z0.s }, p0/z, [x8, #-4, mul vl] +; PREINDEX-NEXT: ld1w { z1.s }, p0/z, [x8] +; PREINDEX-NEXT: decw x2 +; PREINDEX-NEXT: ld1w { z2.s }, p0/z, [x8, x9, lsl #2] +; PREINDEX-NEXT: addvl x8, x8, #1 +; PREINDEX-NEXT: add z0.s, z0.s, z1.s +; PREINDEX-NEXT: add z0.s, z0.s, z2.s +; PREINDEX-NEXT: st1w { z0.s }, p0, [x1] +; PREINDEX-NEXT: addvl x1, x1, #1 +; PREINDEX-NEXT: cbnz x2, .LBB3_1 +; PREINDEX-NEXT: // %bb.2: // %for.exit +; PREINDEX-NEXT: ret +; +; POSTINDEX-LABEL: mixed_offsets_scalable_then_fixed: +; POSTINDEX: // %bb.0: // %entry +; POSTINDEX-NEXT: ptrue p0.s +; POSTINDEX-NEXT: mov x8, xzr +; POSTINDEX-NEXT: addvl x9, x0, #4 +; POSTINDEX-NEXT: mov x10, #8 // =0x8 +; POSTINDEX-NEXT: .LBB3_1: // %for.body +; POSTINDEX-NEXT: // =>This Inner Loop Header: Depth=1 +; POSTINDEX-NEXT: ld1w { z0.s }, p0/z, [x9, #-4, mul vl] +; 
POSTINDEX-NEXT: ld1w { z1.s }, p0/z, [x9] +; POSTINDEX-NEXT: ld1w { z2.s }, p0/z, [x9, x10, lsl #2] +; POSTINDEX-NEXT: addvl x9, x9, #1 +; POSTINDEX-NEXT: add z0.s, z0.s, z1.s +; POSTINDEX-NEXT: add z0.s, z0.s, z2.s +; POSTINDEX-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; POSTINDEX-NEXT: incw x8 +; POSTINDEX-NEXT: cmp x2, x8 +; POSTINDEX-NEXT: b.ne .LBB3_1 +; POSTINDEX-NEXT: // %bb.2: // %for.exit +; POSTINDEX-NEXT: ret entry: %vscale = tail call i64 @llvm.vscale.i64() %mul = shl nuw nsw i64 %vscale, 4 @@ -295,25 +297,66 @@ for.exit: ;; base (but in range of the other). ;; define void @three_access_wide_gap(ptr %src, ptr %dst, i64 %count) #0 { -; COMMON-LABEL: three_access_wide_gap: -; COMMON: // %bb.0: // %entry -; COMMON-NEXT: ptrue p0.s -; COMMON-NEXT: mov x8, xzr -; COMMON-NEXT: addvl x9, x0, #8 -; COMMON-NEXT: addvl x10, x0, #4 -; COMMON-NEXT: .LBB5_1: // %for.body -; COMMON-NEXT: // =>This Inner Loop Header: Depth=1 -; COMMON-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; COMMON-NEXT: ld1w { z1.s }, p0/z, [x10, x8, lsl #2] -; COMMON-NEXT: ld1w { z2.s }, p0/z, [x9, x8, lsl #2] -; COMMON-NEXT: add z0.s, z0.s, z1.s -; COMMON-NEXT: add z0.s, z0.s, z2.s -; COMMON-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] -; COMMON-NEXT: incw x8 -; COMMON-NEXT: cmp x2, x8 -; COMMON-NEXT: b.ne .LBB5_1 -; COMMON-NEXT: // %bb.2: // %for.exit -; COMMON-NEXT: ret +; BASE-LABEL: three_access_wide_gap: +; BASE: // %bb.0: // %entry +; BASE-NEXT: ptrue p0.s +; BASE-NEXT: rdvl x8, #8 +; BASE-NEXT: ptrue p1.b +; BASE-NEXT: .LBB5_1: // %for.body +; BASE-NEXT: // =>This Inner Loop Header: Depth=1 +; BASE-NEXT: ld1w { z0.s }, p0/z, [x0] +; BASE-NEXT: ld1w { z1.s }, p0/z, [x0, #4, mul vl] +; BASE-NEXT: decw x2 +; BASE-NEXT: ld1b { z2.b }, p1/z, [x0, x8] +; BASE-NEXT: addvl x0, x0, #1 +; BASE-NEXT: add z0.s, z0.s, z1.s +; BASE-NEXT: add z0.s, z0.s, z2.s +; BASE-NEXT: st1w { z0.s }, p0, [x1] +; BASE-NEXT: addvl x1, x1, #1 +; BASE-NEXT: cbnz x2, .LBB5_1 +; BASE-NEXT: // %bb.2: // %for.exit +; BASE-NEXT: ret +; +; PREINDEX-LABEL: three_access_wide_gap: +; PREINDEX: // %bb.0: // %entry +; PREINDEX-NEXT: ptrue p0.s +; PREINDEX-NEXT: rdvl x8, #8 +; PREINDEX-NEXT: ptrue p1.b +; PREINDEX-NEXT: .LBB5_1: // %for.body +; PREINDEX-NEXT: // =>This Inner Loop Header: Depth=1 +; PREINDEX-NEXT: ld1w { z0.s }, p0/z, [x0] +; PREINDEX-NEXT: ld1w { z1.s }, p0/z, [x0, #4, mul vl] +; PREINDEX-NEXT: decw x2 +; PREINDEX-NEXT: ld1b { z2.b }, p1/z, [x0, x8] +; PREINDEX-NEXT: addvl x0, x0, #1 +; PREINDEX-NEXT: add z0.s, z0.s, z1.s +; PREINDEX-NEXT: add z0.s, z0.s, z2.s +; PREINDEX-NEXT: st1w { z0.s }, p0, [x1] +; PREINDEX-NEXT: addvl x1, x1, #1 +; PREINDEX-NEXT: cbnz x2, .LBB5_1 +; PREINDEX-NEXT: // %bb.2: // %for.exit +; PREINDEX-NEXT: ret +; +; POSTINDEX-LABEL: three_access_wide_gap: +; POSTINDEX: // %bb.0: // %entry +; POSTINDEX-NEXT: ptrue p0.s +; POSTINDEX-NEXT: mov x8, xzr +; POSTINDEX-NEXT: rdvl x9, #8 +; POSTINDEX-NEXT: ptrue p1.b +; POSTINDEX-NEXT: .LBB5_1: // %for.body +; POSTINDEX-NEXT: // =>This Inner Loop Header: Depth=1 +; POSTINDEX-NEXT: ld1w { z0.s }, p0/z, [x0] +; POSTINDEX-NEXT: ld1w { z1.s }, p0/z, [x0, #4, mul vl] +; POSTINDEX-NEXT: ld1b { z2.b }, p1/z, [x0, x9] +; POSTINDEX-NEXT: addvl x0, x0, #1 +; POSTINDEX-NEXT: add z0.s, z0.s, z1.s +; POSTINDEX-NEXT: add z0.s, z0.s, z2.s +; POSTINDEX-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; POSTINDEX-NEXT: incw x8 +; POSTINDEX-NEXT: cmp x2, x8 +; POSTINDEX-NEXT: b.ne .LBB5_1 +; POSTINDEX-NEXT: // %bb.2: // %for.exit +; POSTINDEX-NEXT: ret entry: %vscale = tail call i64 
@llvm.vscale.i64() %mul = mul nuw nsw i64 %vscale, 16 From f5e855534faafb6ca48a17576ac54d3b3603af6f Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Tue, 18 Jun 2024 09:44:06 +0000 Subject: [PATCH 4/4] Add isFixed, getZero, getScalableFixedMin/Max convenience methods --- llvm/include/llvm/Support/TypeSize.h | 3 + .../Transforms/Scalar/LoopStrengthReduce.cpp | 259 +++++++++--------- 2 files changed, 140 insertions(+), 122 deletions(-) diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h index c6779e258be7c..bae833ecca7d4 100644 --- a/llvm/include/llvm/Support/TypeSize.h +++ b/llvm/include/llvm/Support/TypeSize.h @@ -170,6 +170,9 @@ template class FixedOrScalableQuantity { /// Returns whether the quantity is scaled by a runtime quantity (vscale). constexpr bool isScalable() const { return Scalable; } + /// Returns true if the quantity is not scaled by vscale. + constexpr bool isFixed() const { return !Scalable; } + /// A return value of true indicates we know at compile time that the number /// of elements (vscale * Min) is definitely even. However, returning false /// does not guarantee that the total number of elements is odd. diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 6fd1685a8fa76..810c6b68032fa 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -265,16 +265,29 @@ class Immediate : public details::FixedOrScalableQuantity { : FixedOrScalableQuantity(V) {} public: - constexpr Immediate() : FixedOrScalableQuantity() {} + constexpr Immediate() = delete; static constexpr Immediate getFixed(ScalarTy MinVal) { - return Immediate(MinVal, false); + return {MinVal, false}; } static constexpr Immediate getScalable(ScalarTy MinVal) { - return Immediate(MinVal, true); + return {MinVal, true}; } static constexpr Immediate get(ScalarTy MinVal, bool Scalable) { - return Immediate(MinVal, Scalable); + return {MinVal, Scalable}; + } + static constexpr Immediate getZero() { return {0, false}; } + static constexpr Immediate getFixedMin() { + return {std::numeric_limits::min(), false}; + } + static constexpr Immediate getFixedMax() { + return {std::numeric_limits::max(), false}; + } + static constexpr Immediate getScalableMin() { + return {std::numeric_limits::min(), true}; + } + static constexpr Immediate getScalableMax() { + return {std::numeric_limits::max(), true}; } constexpr bool isLessThanZero() const { return Quantity < 0; } @@ -292,6 +305,47 @@ class Immediate : public details::FixedOrScalableQuantity { constexpr bool isMax() const { return Quantity == std::numeric_limits::max(); } + + // Arithmetic 'operators' that cast to unsigned types first. + constexpr Immediate addUnsigned(const Immediate &RHS) const { + assert(isCompatibleImmediate(RHS) && "Incompatible Immediates"); + ScalarTy Value = (uint64_t)Quantity + RHS.getKnownMinValue(); + return {Value, Scalable || RHS.isScalable()}; + } + + constexpr Immediate subUnsigned(const Immediate &RHS) const { + assert(isCompatibleImmediate(RHS) && "Incompatible Immediates"); + ScalarTy Value = (uint64_t)Quantity - RHS.getKnownMinValue(); + return {Value, Scalable || RHS.isScalable()}; + } + + // Scale the quantity by a constant without caring about runtime scalability. + constexpr Immediate mulUnsigned(const ScalarTy RHS) const { + ScalarTy Value = (uint64_t)Quantity * RHS; + return {Value, Scalable}; + } + + // Helpers for generating SCEVs with vscale terms where needed. 
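The arithmetic helpers above (addUnsigned, subUnsigned, mulUnsigned) centralize the cast-to-uint64_t-then-recombine pattern that the earlier patches in the series open-coded at each call site. A purely illustrative fragment of how later hunks use them; the values are made up, and since Immediate is local to LoopStrengthReduce.cpp this does not compile on its own:

  static void immediateAlgebraExample() {
    Immediate Base = Immediate::getScalable(32); // 32 * vscale
    Immediate Step = Immediate::getScalable(16); // 16 * vscale
    Immediate Sum  = Base.addUnsigned(Step);     // 48 * vscale
    Immediate Diff = Base.subUnsigned(Step);     // 16 * vscale
    Immediate Neg  = Diff.mulUnsigned(-1);       // -16 * vscale
    (void)Sum; (void)Neg;
    // addUnsigned/subUnsigned assert isCompatibleImmediate(), so mixing a
    // non-zero fixed offset with a non-zero scalable one trips an assert in
    // asserts builds instead of being folded silently.
  }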
+ const SCEV *getSCEV(ScalarEvolution &SE, Type *Ty) const { + const SCEV *S = SE.getConstant(Ty, Quantity); + if (Scalable) + S = SE.getMulExpr(S, SE.getVScale(S->getType())); + return S; + } + + const SCEV *getNegativeSCEV(ScalarEvolution &SE, Type *Ty) const { + const SCEV *NegS = SE.getConstant(Ty, -(uint64_t)Quantity); + if (Scalable) + NegS = SE.getMulExpr(NegS, SE.getVScale(NegS->getType())); + return NegS; + } + + const SCEV *getUnknownSCEV(ScalarEvolution &SE, Type *Ty) const { + const SCEV *SU = SE.getUnknown(ConstantInt::getSigned(Ty, Quantity)); + if (Scalable) + SU = SE.getMulExpr(SU, SE.getVScale(SU->getType())); + return SU; + } }; // This is needed for the Compare type of std::map when Immediate is used @@ -431,7 +485,7 @@ struct Formula { GlobalValue *BaseGV = nullptr; /// Base offset for complex addressing. - Immediate BaseOffset; + Immediate BaseOffset = Immediate::getZero(); /// Whether any complex addressing has a base register. bool HasBaseReg = false; @@ -462,7 +516,7 @@ struct Formula { /// An additional constant offset which added near the use. This requires a /// temporary register, but the offset itself can live in an add immediate /// field rather than a register. - Immediate UnfoldedOffset; + Immediate UnfoldedOffset = Immediate::getZero(); Formula() = default; @@ -899,7 +953,7 @@ static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { S = SE.getConstant(M->getType(), 0); return Immediate::getScalable(C->getValue()->getSExtValue()); } - return Immediate(); + return Immediate::getZero(); } /// If S involves the addition of a GlobalValue address, return that symbol, and @@ -1214,7 +1268,7 @@ struct LSRFixup { /// A constant offset to be added to the LSRUse expression. This allows /// multiple fixups to share the same LSRUse with different offsets, for /// example in an unrolled loop. - Immediate Offset; + Immediate Offset = Immediate::getZero(); LSRFixup() = default; @@ -1277,10 +1331,8 @@ class LSRUse { SmallVector Fixups; /// Keep track of the min and max offsets of the fixups. - Immediate MinOffset = - Immediate::getFixed(std::numeric_limits::max()); - Immediate MaxOffset = - Immediate::getFixed(std::numeric_limits::min()); + Immediate MinOffset = Immediate::getFixedMax(); + Immediate MaxOffset = Immediate::getFixedMin(); /// This records whether all of the fixups using this LSRUse are outside of /// the loop, in which case some special-case heuristics may be used. @@ -1338,8 +1390,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale, - Instruction *Fixup = nullptr, - int64_t ScalableOffset = 0); + Instruction *Fixup = nullptr); static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) { if (isa(Reg) || isa(Reg)) @@ -1391,7 +1442,7 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg, // If the step size matches the base offset, we could use pre-indexed // addressing. - if (AMK == TTI::AMK_PreIndexed && !F.BaseOffset.isScalable()) { + if (AMK == TTI::AMK_PreIndexed && F.BaseOffset.isFixed()) { if (auto *Step = dyn_cast(AR->getStepRecurrence(*SE))) if (Step->getAPInt() == F.BaseOffset.getFixedValue()) LoopCost = 0; @@ -1491,25 +1542,25 @@ void Cost::RateFormula(const Formula &F, // Tally up the non-zero immediates. for (const LSRFixup &Fixup : LU.Fixups) { - // FIXME: We probably want to noticeably increase the cost if the - // two offsets differ in scalability? 
- bool Scalable = Fixup.Offset.isScalable() || F.BaseOffset.isScalable(); - int64_t O = Fixup.Offset.getKnownMinValue(); - Immediate Offset = Immediate::get( - (uint64_t)(O) + F.BaseOffset.getKnownMinValue(), Scalable); - if (F.BaseGV) - C.ImmCost += 64; // Handle symbolic values conservatively. - // TODO: This should probably be the pointer size. - else if (Offset.isNonZero()) - C.ImmCost += - APInt(64, Offset.getKnownMinValue(), true).getSignificantBits(); - - // Check with target if this offset with this instruction is - // specifically not supported. - if (LU.Kind == LSRUse::Address && Offset.isNonZero() && - !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV, - Offset, F.HasBaseReg, F.Scale, Fixup.UserInst)) - C.NumBaseAdds++; + if (Fixup.Offset.isCompatibleImmediate(F.BaseOffset)) { + Immediate Offset = Fixup.Offset.addUnsigned(F.BaseOffset); + if (F.BaseGV) + C.ImmCost += 64; // Handle symbolic values conservatively. + // TODO: This should probably be the pointer size. + else if (Offset.isNonZero()) + C.ImmCost += + APInt(64, Offset.getKnownMinValue(), true).getSignificantBits(); + + // Check with target if this offset with this instruction is + // specifically not supported. + if (LU.Kind == LSRUse::Address && Offset.isNonZero() && + !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV, + Offset, F.HasBaseReg, F.Scale, Fixup.UserInst)) + C.NumBaseAdds++; + } else { + // Incompatible immediate type, increase cost to avoid using + C.ImmCost += 2048; + } } // If we don't count instruction cost exit here. @@ -1763,8 +1814,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale, - Instruction *Fixup /* = nullptr */, - int64_t ScalableOffset) { + Instruction *Fixup /* = nullptr */) { switch (Kind) { case LSRUse::Address: { int64_t FixedOffset = @@ -1778,7 +1828,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, case LSRUse::ICmpZero: // There's not even a target hook for querying whether it would be legal to // fold a GV into an ICmp. - if (BaseGV || ScalableOffset != 0) + if (BaseGV) return false; // ICmp only has two operands; don't allow more than two non-trivial parts. @@ -1961,7 +2011,7 @@ static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, - bool HasBaseReg, int64_t ScalableOffset = 0) { + bool HasBaseReg) { // Fast-path: zero is always foldable. 
if (BaseOffset.isZero() && !BaseGV) return true; @@ -1987,7 +2037,7 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, Scale = 0; return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset, - HasBaseReg, Scale, nullptr, ScalableOffset); + HasBaseReg, Scale); } static bool isAlwaysFoldable(const TargetTransformInfo &TTI, @@ -3304,14 +3354,13 @@ void LSRInstance::FinalizeChain(IVChain &Chain) { static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, Value *Operand, const TargetTransformInfo &TTI) { const SCEVConstant *IncConst = dyn_cast(IncExpr); - int64_t IncOffset = 0; - int64_t ScalableOffset = 0; + Immediate IncOffset = Immediate::getZero(); if (IncConst) { if (IncConst && IncConst->getAPInt().getSignificantBits() > 64) return false; - IncOffset = IncConst->getValue()->getSExtValue(); + IncOffset = Immediate::getFixed(IncConst->getValue()->getSExtValue()); } else { - // Look for mul(vscale, constant), to detect ScalableOffset. + // Look for mul(vscale, constant), to detect a scalable offset. auto *IncVScale = dyn_cast(IncExpr); if (!IncVScale || IncVScale->getNumOperands() != 2 || !isa(IncVScale->getOperand(1))) @@ -3319,7 +3368,7 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, auto *Scale = dyn_cast(IncVScale->getOperand(0)); if (!Scale || Scale->getType()->getScalarSizeInBits() > 64) return false; - ScalableOffset = Scale->getValue()->getSExtValue(); + IncOffset = Immediate::getScalable(Scale->getValue()->getSExtValue()); } if (!isAddressUse(TTI, UserInst, Operand)) @@ -3327,8 +3376,7 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand); if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr, - Immediate::getFixed(IncOffset), /*HasBaseReg=*/false, - ScalableOffset)) + IncOffset, /*HasBaseReg=*/false)) return false; return true; @@ -3911,6 +3959,9 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, continue; Formula F = Base; + if (F.UnfoldedOffset.isNonZero() && F.UnfoldedOffset.isScalable()) + continue; + // Add the remaining pieces of the add back into the new formula. const SCEVConstant *InnerSumSC = dyn_cast(InnerSum); if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 && @@ -4026,7 +4077,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, // If we have an unfolded offset, generate a formula combining it with the // registers collected. - if (NewBase.UnfoldedOffset) { + if (NewBase.UnfoldedOffset.isNonZero() && NewBase.UnfoldedOffset.isFixed()) { assert(CombinedIntegerType && "Missing a type for the unfolded offset"); Ops.push_back(SE.getConstant(CombinedIntegerType, NewBase.UnfoldedOffset.getFixedValue(), true)); @@ -4074,21 +4125,13 @@ void LSRInstance::GenerateConstantOffsetsImpl( auto GenerateOffset = [&](const SCEV *G, Immediate Offset) { Formula F = Base; - if (Base.BaseOffset.isScalable() != Offset.isScalable() && - Base.BaseOffset.isNonZero() && Offset.isNonZero()) + if (!Base.BaseOffset.isCompatibleImmediate(Offset)) return; - bool Scalable = Base.BaseOffset.isScalable() || Offset.isScalable(); - F.BaseOffset = Immediate::get((uint64_t)Base.BaseOffset.getKnownMinValue() - - Offset.getKnownMinValue(), - Scalable); + F.BaseOffset = Base.BaseOffset.subUnsigned(Offset); if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) { // Add the offset to the base register. 
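canFoldIVIncExpr above recognizes an IV step of the form (C * vscale) and now carries it as a scalable Immediate rather than a separate ScalableOffset variable. The matching logic, simplified; the SCEVConstant/SCEVMulExpr/SCEVVScale cast targets are assumptions here (the template arguments are elided in the hunk as rendered), and the real code also rejects constants wider than 64 bits:

  struct StepImm {
    int64_t Offset;
    bool Scalable;
    bool Valid;
  };

  // Sketch of the step matching in canFoldIVIncExpr: either a plain constant
  // (fixed offset) or a two-operand multiply of a constant by vscale
  // (scalable offset). SCEV canonicalizes constants first, hence operands
  // 0 and 1.
  static StepImm matchIVStep(const SCEV *IncExpr) {
    if (auto *C = dyn_cast<SCEVConstant>(IncExpr))
      return {C->getValue()->getSExtValue(), /*Scalable=*/false, true};
    if (auto *Mul = dyn_cast<SCEVMulExpr>(IncExpr))
      if (Mul->getNumOperands() == 2 && isa<SCEVVScale>(Mul->getOperand(1)))
        if (auto *Scale = dyn_cast<SCEVConstant>(Mul->getOperand(0)))
          return {Scale->getValue()->getSExtValue(), /*Scalable=*/true, true};
    return {0, false, false};
  }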
- const SCEV *NewOffset = - SE.getConstant(G->getType(), Offset.getKnownMinValue()); - if (Scalable) - NewOffset = - SE.getMulExpr(NewOffset, SE.getVScale(NewOffset->getType())); + const SCEV *NewOffset = Offset.getSCEV(SE, G->getType()); const SCEV *NewG = SE.getAddExpr(NewOffset, G); // If it cancelled out, drop the base register, otherwise update it. if (NewG->isZero()) { @@ -4126,7 +4169,7 @@ void LSRInstance::GenerateConstantOffsetsImpl( StepInt.getSExtValue() : StepInt.getZExtValue(); for (Immediate Offset : Worklist) { - if (!Offset.isScalable()) { + if (Offset.isFixed()) { Offset = Immediate::getFixed(Offset.getFixedValue() - Step); GenerateOffset(G, Offset); } @@ -4139,12 +4182,10 @@ void LSRInstance::GenerateConstantOffsetsImpl( Immediate Imm = ExtractImmediate(G, SE); if (G->isZero() || Imm.isZero() || - Base.BaseOffset.isScalable() != Imm.isScalable()) + !Base.BaseOffset.isCompatibleImmediate(Imm)) return; Formula F = Base; - F.BaseOffset = Immediate::get((uint64_t)F.BaseOffset.getKnownMinValue() + - Imm.getKnownMinValue(), - Imm.isScalable()); + F.BaseOffset = F.BaseOffset.addUnsigned(Imm); if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) return; if (IsScaledReg) { @@ -4205,8 +4246,10 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, // Check that the multiplication doesn't overflow. if (Base.BaseOffset.isMin() && Factor == -1) continue; - Immediate NewBaseOffset = - Immediate::getFixed((uint64_t)Base.BaseOffset.getFixedValue() * Factor); + // Not supporting scalable immediates. + if (Base.BaseOffset.isNonZero() && Base.BaseOffset.isScalable()) + continue; + Immediate NewBaseOffset = Base.BaseOffset.mulUnsigned(Factor); assert(Factor != 0 && "Zero factor not expected!"); if (NewBaseOffset.getFixedValue() / Factor != Base.BaseOffset.getFixedValue()) @@ -4220,7 +4263,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Immediate Offset = LU.MinOffset; if (Offset.isMin() && Factor == -1) continue; - Offset = Immediate::getFixed((uint64_t)Offset.getFixedValue() * Factor); + Offset = Offset.mulUnsigned(Factor); if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue()) continue; // If the offset will be truncated at this use, check that it is in bounds. @@ -4236,9 +4279,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, continue; // Compensate for the use having MinOffset built into it. - F.BaseOffset = Immediate::getFixed((uint64_t)F.BaseOffset.getFixedValue() + - Offset.getFixedValue() - - LU.MinOffset.getFixedValue()); + F.BaseOffset = F.BaseOffset.addUnsigned(Offset).subUnsigned(LU.MinOffset); const SCEV *FactorS = SE.getConstant(IntTy, Factor); @@ -4260,8 +4301,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, if (F.UnfoldedOffset.isNonZero()) { if (F.UnfoldedOffset.isMin() && Factor == -1) continue; - F.UnfoldedOffset = Immediate::getFixed( - (uint64_t)F.UnfoldedOffset.getFixedValue() * Factor); + F.UnfoldedOffset = F.UnfoldedOffset.mulUnsigned(Factor); if (F.UnfoldedOffset.getFixedValue() / Factor != Base.UnfoldedOffset.getFixedValue()) continue; @@ -4517,8 +4557,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // other orig regs. 
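GenerateICmpZeroScales in the hunk above keeps the existing overflow discipline when it scales offsets with mulUnsigned: multiply in wrapping unsigned arithmetic, then divide back and compare. A compact version of that check; as in the code above, Factor is non-zero and the (INT64_MIN, -1) pair is filtered out beforehand:

  #include <cstdint>

  // Detects overflow of A * Factor the way the LSR code does: compute the
  // product with wrapping unsigned arithmetic, then check that dividing by
  // Factor recovers A.
  bool scaledOffsetOverflows(int64_t A, int64_t Factor) {
    int64_t Product = (int64_t)((uint64_t)A * (uint64_t)Factor);
    return Product / Factor != A;
  }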
Immediate First = Imms.begin()->first; Immediate Last = std::prev(Imms.end())->first; - if (First.isScalable() != Last.isScalable() && First.isNonZero() && - Last.isNonZero()) { + if (!First.isCompatibleImmediate(Last)) { LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << "\n"); continue; @@ -4539,15 +4578,11 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { Imms.lower_bound(Immediate::get(Avg, Scalable))}; for (const auto &M : OtherImms) { if (M == J || M == JE) continue; - if (JImm.isScalable() != M->first.isScalable() && JImm.isNonZero() && - M->first.isNonZero()) + if (!JImm.isCompatibleImmediate(M->first)) continue; // Compute the difference between the two. - bool Scalable = JImm.isScalable() || M->first.isScalable(); - Immediate Imm = Immediate::get((uint64_t)JImm.getKnownMinValue() - - M->first.getKnownMinValue(), - Scalable); + Immediate Imm = JImm.subUnsigned(M->first); for (unsigned LUIdx : UsedByIndices.set_bits()) // Make a memo of this use, offset, and register tuple. if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second) @@ -4569,10 +4604,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { const SCEV *OrigReg = WI.OrigReg; Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType()); - const SCEV *NegImmS = - SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm.getKnownMinValue())); - if (Imm.isScalable()) - NegImmS = SE.getMulExpr(NegImmS, SE.getVScale(NegImmS->getType())); + const SCEV *NegImmS = Imm.getNegativeSCEV(SE, IntTy); unsigned BitWidth = SE.getTypeSizeInBits(IntTy); // TODO: Use a more targeted data structure. @@ -4585,19 +4617,11 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { F.unscale(); // Use the immediate in the scaled register. if (F.ScaledReg == OrigReg) { - if (F.BaseOffset.isScalable() != Imm.isScalable() && - F.BaseOffset.isNonZero() && Imm.isNonZero()) + if (!F.BaseOffset.isCompatibleImmediate(Imm)) continue; - bool Scalable = F.BaseOffset.isScalable() || Imm.isScalable(); - Immediate Offset = - Immediate::get((uint64_t)F.BaseOffset.getKnownMinValue() + - Imm.getKnownMinValue() * (uint64_t)F.Scale, - Scalable); + Immediate Offset = F.BaseOffset.addUnsigned(Imm.mulUnsigned(F.Scale)); // Don't create 50 + reg(-50). - const SCEV *S = SE.getSCEV( - ConstantInt::get(IntTy, -(uint64_t)Offset.getKnownMinValue())); - if (Scalable) - S = SE.getMulExpr(S, SE.getVScale(S->getType())); + const SCEV *S = Offset.getNegativeSCEV(SE, IntTy); if (F.referencesReg(S)) continue; Formula NewF = F; @@ -4610,12 +4634,18 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // If the new scale is a constant in a register, and adding the constant // value to the immediate would produce a value closer to zero than the // immediate itself, then the formula isn't worthwhile. - if (const SCEVConstant *C = dyn_cast(NewF.ScaledReg)) + if (const SCEVConstant *C = dyn_cast(NewF.ScaledReg)) { + // FIXME: Do we need to do something for scalable immediates here? + // A scalable SCEV won't be constant, but we might still have + // something in the offset? Bail out for now to be safe. + if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable()) + continue; if (C->getValue()->isNegative() != (NewF.BaseOffset.isLessThanZero()) && (C->getAPInt().abs() * APInt(BitWidth, F.Scale)) .ule(std::abs(NewF.BaseOffset.getFixedValue()))) continue; + } // OK, looks good. 
NewF.canonicalize(*this->L); @@ -4631,21 +4661,13 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) || !NewF.BaseOffset.isCompatibleImmediate(NewF.UnfoldedOffset)) continue; - bool Scalable = NewF.BaseOffset.isScalable() || Imm.isScalable() || - NewF.UnfoldedOffset.isScalable(); - NewF.BaseOffset = - Immediate::get((uint64_t)NewF.BaseOffset.getKnownMinValue() + - Imm.getKnownMinValue(), - Scalable); + NewF.BaseOffset = NewF.BaseOffset.addUnsigned(Imm); if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, NewF)) { if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE)) continue; - Immediate NewUnfoldedOffset = Immediate::get( - (uint64_t)NewF.UnfoldedOffset.getKnownMinValue() + - Imm.getKnownMinValue(), - Scalable); + Immediate NewUnfoldedOffset = NewF.UnfoldedOffset.addUnsigned(Imm); if (!isLegalAddImmediate(TTI, NewUnfoldedOffset)) continue; NewF = F; @@ -4657,7 +4679,9 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // constant value to the immediate would produce a value closer to // zero than the immediate itself, then the formula isn't worthwhile. for (const SCEV *NewReg : NewF.BaseRegs) - if (const SCEVConstant *C = dyn_cast(NewReg)) + if (const SCEVConstant *C = dyn_cast(NewReg)) { + if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable()) + goto skip_formula; if ((C->getAPInt() + NewF.BaseOffset.getFixedValue()) .abs() .slt(std::abs(NewF.BaseOffset.getFixedValue())) && @@ -4666,6 +4690,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { (unsigned)llvm::countr_zero( NewF.BaseOffset.getFixedValue())) goto skip_formula; + } // Ok, looks good. NewF.canonicalize(*this->L); @@ -4849,6 +4874,8 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { bool Any = false; for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) { Formula &F = LU.Formulae[i]; + if (F.BaseOffset.isNonZero() && F.BaseOffset.isScalable()) + continue; // Look for a formula with a constant or GV in a register. If the use // also has a formula with that same value in an immediate field, // delete the one that uses a register. @@ -5754,14 +5781,10 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF, // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail // out at this point, or should we generate a SCEV adding together mixed // offsets? - assert((F.BaseOffset.isScalable() == LF.Offset.isScalable() || - F.BaseOffset.isZero() || LF.Offset.isZero()) && + assert(F.BaseOffset.isCompatibleImmediate(LF.Offset) && "Expanding mismatched offsets\n"); - bool Scalable = F.BaseOffset.isScalable() || LF.Offset.isScalable(); // Expand the immediate portion. - Immediate Offset = Immediate::get((uint64_t)F.BaseOffset.getKnownMinValue() + - LF.Offset.getKnownMinValue(), - Scalable); + Immediate Offset = F.BaseOffset.addUnsigned(LF.Offset); if (Offset.isNonZero()) { if (LU.Kind == LSRUse::ICmpZero) { // The other interesting way of "folding" with an ICmpZero is to use a @@ -5776,23 +5799,15 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF, } else { // Just add the immediate values. These again are expected to be matched // as part of the address. - const SCEV *SU = SE.getUnknown( - ConstantInt::getSigned(IntTy, Offset.getKnownMinValue())); - if (Scalable) - SU = SE.getMulExpr(SU, SE.getVScale(SU->getType())); - Ops.push_back(SU); + Ops.push_back(Offset.getUnknownSCEV(SE, IntTy)); } } // Expand the unfolded offset portion. 
Immediate UnfoldedOffset = F.UnfoldedOffset; if (UnfoldedOffset.isNonZero()) { - const SCEV *SU = SE.getUnknown( - ConstantInt::getSigned(IntTy, UnfoldedOffset.getKnownMinValue())); - if (UnfoldedOffset.isScalable()) - SU = SE.getMulExpr(SU, SE.getVScale(SU->getType())); // Just add the immediate values. - Ops.push_back(SU); + Ops.push_back(UnfoldedOffset.getUnknownSCEV(SE, IntTy)); } // Emit instructions summing all the operands.
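One idiom recurs across the whole series (getNegativeSCEV, the -(uint64_t)... casts in GenerateCrossUseConstantOffsets, and the unsigned add/sub helpers): signed offsets are negated and combined through uint64_t so that wrap-prone values never trigger signed-overflow UB, with the bit pattern then reinterpreted as int64_t. In isolation, as a sketch:

  #include <cstdint>

  // Negate without signed-overflow UB: 0 - (uint64_t)V wraps, and the result
  // reinterpreted as int64_t is -V for every value except INT64_MIN, which
  // maps back to itself (callers guard or tolerate that case).
  int64_t negateWrapping(int64_t V) {
    return (int64_t)(0 - (uint64_t)V);
  }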