-
Notifications
You must be signed in to change notification settings - Fork 12.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64] merge index address with large offset into base address #72187
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-aarch64 Author: Allen (vfdff) ChangesA case for this transformation, https://gcc.godbolt.org/z/nhYcWq1WE Fold Only Full diff: https://github.com/llvm/llvm-project/pull/72187.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 6fdf5363bae2928..87d23a9e2935fae 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4081,6 +4081,11 @@ AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
return MI.getOperand(Idx);
}
+const MachineOperand &
+AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
+ return MI.getOperand(4);
+}
+
static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
Register Reg) {
if (MI.getParent() == nullptr)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index a934103c90cbf92..08b568bb3484135 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -111,6 +111,9 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
/// Returns the immediate offset operator of a load/store.
static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI);
+ /// Returns the shift amount operator of a load/store.
+ static const MachineOperand &getLdStAmountOp(const MachineInstr &MI);
+
/// Returns whether the instruction is FP or NEON.
static bool isFpOrNEON(const MachineInstr &MI);
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 7add127e21e3e55..eeb24dfc72465bf 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -62,6 +62,8 @@ STATISTIC(NumUnscaledPairCreated,
"Number of load/store from unscaled generated");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
+STATISTIC(NumConstOffsetFolded,
+ "Number of const offset of index address folded");
DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
"Controls which pairs are considered for renaming");
@@ -171,6 +173,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
int UnscaledOffset, unsigned Limit);
+ // Scan the instruction list to find a register assigned with a const
+ // value that can be combined with the current instruction (a load or store)
+ // using base addressing with writeback. Scan forwards.
+ MachineBasicBlock::iterator
+ findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
+ unsigned &Offset);
+
// Scan the instruction list to find a base register update that can
// be combined with the current instruction (a load or store) using
// pre or post indexed addressing with writeback. Scan backwards.
@@ -182,11 +191,18 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
unsigned BaseReg, int Offset);
+ bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
+ unsigned IndexReg, unsigned &Offset);
+
// Merge a pre- or post-index base register update into a ld/st instruction.
MachineBasicBlock::iterator
mergeUpdateInsn(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Update, bool IsPreIdx);
+ MachineBasicBlock::iterator
+ mergeConstOffsetInsn(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Update, unsigned Offset);
+
// Find and merge zero store instructions.
bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
@@ -199,6 +215,9 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Find and merge a base register updates before or after a ld/st instruction.
bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
+ // Find and merge a index ldr/st instructions into a base ld/st instruction.
+ bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI);
+
bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -481,6 +500,16 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
}
}
+static unsigned getBaseAddressOpcode(unsigned Opc) {
+ // TODO: Add more index address loads/stores.
+ switch (Opc) {
+ default:
+ llvm_unreachable("Opcode has no base address equivalent!");
+ case AArch64::LDRBBroX:
+ return AArch64::LDRBBui;
+ }
+}
+
static unsigned getPostIndexedOpcode(unsigned Opc) {
switch (Opc) {
default:
@@ -722,6 +751,19 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) {
}
}
+// Make sure this is a reg+reg Ld/St
+static bool isMergeableIndexLdSt(MachineInstr &MI) {
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ default:
+ return false;
+ // Scaled instructions.
+ // TODO: Add more index address loads/stores.
+ case AArch64::LDRBBroX:
+ return true;
+ }
+}
+
static bool isRewritableImplicitDef(unsigned Opc) {
switch (Opc) {
default:
@@ -1925,6 +1967,62 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
return NextI;
}
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Update,
+ unsigned Offset) {
+ assert((Update->getOpcode() == AArch64::MOVKWi) &&
+ "Unexpected const mov instruction to merge!");
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+ MachineBasicBlock::iterator PrevI = prev_nodbg(Update, E);
+ MachineInstr &MemMI = *I;
+ unsigned Imm12 = Offset & 0x0fff;
+ unsigned High = Offset - Imm12;
+ Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
+ Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
+ MachineInstrBuilder AddMIB, MemMIB;
+
+ // Add IndexReg, BaseReg, High (the BaseReg may be SP)
+ AddMIB =
+ BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(AArch64::ADDXri))
+ .addDef(IndexReg)
+ .addUse(BaseReg)
+ .addImm(High >> 12) // shifted value
+ .addImm(12); // shift 12
+ (void)AddMIB;
+ // Ld/St DestReg, IndexReg, Imm12
+ unsigned NewOpc = getBaseAddressOpcode(I->getOpcode());
+ MemMIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+ .add(getLdStRegOp(MemMI))
+ .add(AArch64InstrInfo::getLdStOffsetOp(MemMI))
+ .addImm(Imm12)
+ .setMemRefs(I->memoperands())
+ .setMIFlags(I->mergeFlagsWith(*Update));
+ (void)MemMIB;
+
+ ++NumConstOffsetFolded;
+ LLVM_DEBUG(dbgs() << "Creating base address load/store.\n");
+ LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
+ LLVM_DEBUG(PrevI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(Update->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(I->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " with instruction:\n ");
+ LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+
+ // Erase the old instructions for the block.
+ I->eraseFromParent();
+ PrevI->eraseFromParent();
+ Update->eraseFromParent();
+
+ return NextI;
+}
+
bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
MachineInstr &MI,
unsigned BaseReg, int Offset) {
@@ -1972,6 +2070,30 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
return false;
}
+bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI,
+ MachineInstr &MI,
+ unsigned IndexReg,
+ unsigned &Offset) {
+ // The update instruction source and destination register must be the
+ // same as the load/store index register.
+ if (MI.getOpcode() == AArch64::MOVKWi &&
+ TRI->isSuperOrSubRegisterEq(IndexReg, MI.getOperand(1).getReg())) {
+
+ // movz + movk hold a large offset of a Ld/St instruction.
+ MachineBasicBlock::iterator B = MI.getParent()->begin();
+ MachineBasicBlock::iterator MBBI = &MI;
+ MBBI = prev_nodbg(MBBI, B);
+ MachineInstr &MovzMI = *MBBI;
+ if (MovzMI.getOpcode() == AArch64::MOVZWi) {
+ unsigned Low = MovzMI.getOperand(1).getImm();
+ unsigned High = MI.getOperand(2).getImm() << MI.getOperand(3).getImm();
+ Offset = High + Low;
+ return true;
+ }
+ }
+ return false;
+}
+
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
MachineBasicBlock::iterator E = I->getParent()->end();
@@ -2127,6 +2249,60 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
return E;
}
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
+ MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
+ MachineBasicBlock::iterator B = I->getParent()->begin();
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineInstr &MemMI = *I;
+ MachineBasicBlock::iterator MBBI = I;
+
+ // If the load is the first instruction in the block, there's obviously
+ // not any matching load or store.
+ if (MBBI == B)
+ return E;
+
+ // Make sure this the shift amount is zero.
+ // TODO: Relex this restriction to extend, simplify processing now.
+ if (!AArch64InstrInfo::getLdStAmountOp(MemMI).isImm() ||
+ (AArch64InstrInfo::getLdStAmountOp(MemMI).getImm() != 0))
+ return E;
+
+ Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
+ Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
+
+ // Track which register units have been modified and used between the first
+ // insn (inclusive) and the second insn.
+ ModifiedRegUnits.clear();
+ UsedRegUnits.clear();
+ unsigned Count = 0;
+ do {
+ MBBI = prev_nodbg(MBBI, B);
+ MachineInstr &MI = *MBBI;
+
+ // Don't count transient instructions towards the search limit since there
+ // may be different numbers of them if e.g. debug information is present.
+ if (!MI.isTransient())
+ ++Count;
+
+ // If we found a match, return it.
+ if (isMatchingMovConstInsn(*I, MI, IndexReg, Offset)) {
+ return MBBI;
+ }
+
+ // Update the status of what the instruction clobbered and used.
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
+
+ // Otherwise, if the base register is used or modified, we have no match, so
+ // return early.
+ if (!ModifiedRegUnits.available(BaseReg) ||
+ !UsedRegUnits.available(BaseReg))
+ return E;
+
+ } while (MBBI != B && Count < Limit);
+ return E;
+}
+
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
MachineBasicBlock::iterator &MBBI) {
MachineInstr &MI = *MBBI;
@@ -2311,6 +2487,34 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
return false;
}
+bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(
+ MachineBasicBlock::iterator &MBBI) {
+ MachineInstr &MI = *MBBI;
+ MachineBasicBlock::iterator E = MI.getParent()->end();
+ MachineBasicBlock::iterator Update;
+
+ // Don't know how to handle unscaled pre/post-index versions below, so bail.
+ if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
+ return false;
+
+ // Look back to try to find a const offset for index LdSt instruction. For
+ // example,
+ // mov x8, #LargeImm ; = a * (1<<12) + imm12
+ // ldr x1, [x0, x8]
+ // merged into:
+ // add x8, x0, a * (1<<12)
+ // ldr x1, [x8, imm12]
+ unsigned Offset;
+ Update = findMatchingConstOffsetBackward(MBBI, UpdateLimit, Offset);
+ if (Update != E) {
+ // Merge the imm12 into the ld/st.
+ MBBI = mergeConstOffsetInsn(MBBI, Update, Offset);
+ return true;
+ }
+
+ return false;
+}
+
bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
bool EnableNarrowZeroStOpt) {
@@ -2389,6 +2593,21 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
++MBBI;
}
+ // 5) Find a register assigned with a const value that can be combined with
+ // into the load or store. e.g.,
+ // mov x8, #LargeImm ; = a * (1<<12) + imm12
+ // ldr x1, [x0, x8]
+ // ; becomes
+ // add x8, x0, a * (1<<12)
+ // ldr x1, [x8, imm12]
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ MBBI != E;) {
+ if (isMergeableIndexLdSt(*MBBI) && tryToMergeIndexLdSt(MBBI))
+ Modified = true;
+ else
+ ++MBBI;
+ }
+
return Modified;
}
diff --git a/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir b/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir
new file mode 100644
index 000000000000000..719f666213c10e2
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir
@@ -0,0 +1,32 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64 -run-pass aarch64-ldst-opt %s -o - | FileCheck %s
+--- |
+ define i32 @testOffset(ptr nocapture noundef readonly %a) {
+ entry:
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ ret i32 %conv
+ }
+
+...
+---
+name: testOffset
+liveins:
+ - { reg: '$x0', virtual-reg: '' }
+body: |
+ bb.0.entry:
+ liveins: $x0
+
+ ; CHECK-LABEL: name: testOffset
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $x8 = ADDXri $x0, 253, 12
+ ; CHECK-NEXT: renamable $w0 = LDRBBui killed renamable $x8, 3704 :: (load (s8) from %ir.arrayidx)
+ ; CHECK-NEXT: RET undef $lr, implicit $w0
+ renamable $w8 = MOVZWi 56952, 0
+ renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8
+ renamable $w0 = LDRBBroX killed renamable $x0, killed renamable $x8, 0, 0 :: (load (s8) from %ir.arrayidx)
+ RET undef $lr, implicit $w0
+
+...
|
cda7796
to
bff3b81
Compare
This seems like it would be simpler to do at the point where we're computing the addressing mode in the first place (in ISelDAGToDAG). |
Thanks for your idea @efriedma-quic
|
bff3b81
to
4e685fc
Compare
hi @efriedma-quic, I refactor that in |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It might be worth doing this as a (late) AArch64ISelLowering DAG combine so that the add gets shared between multiple load/store instructions.
// ADD BaseReg, WideImmediate & 0x0fff000 | ||
// LDR X2, [BaseReg, WideImmediate & 0x0fff] | ||
SDValue LHS = N.getOperand(0); | ||
if (isPreferredBaseAddrMode(RHSC)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think there needs to be a check for Scale here too.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, I'll add the check for Scale
.
Yes, because the shifted value ImmOffLow >> Scale
is used as the offset of load/store, so check it is necessary
|
Could it share a similar method called from both performLOADCombine and performStoreCombine? I'm not sure if the opposite transform exists in DAG, but if it doesn't it should help only produce a single add. |
hi @davemgreen
so need more support in CodeGenPrepare::optimizeMemoryInst ? |
Ah I see. Like this: https://gcc.godbolt.org/z/4Ph36xdEv? It sounds like that might work, so long as the operands have multiple uses it shouldn't just transform it back to a single add. It might need something like the current patch to handle single-use cases, with something different for multiple-uses. |
4e685fc
to
42e4cf5
Compare
Thanks @davemgreen for your idea. |
✅ With the latest revision this PR passed the C/C++ code formatter. |
Add the 2nd patch to fix remainder |
49e5f3a
to
205255d
Compare
update to fix Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is probably best to split the CGP and DAG parts out into separate patches if we can. I'm seeing some failures in the test-suite, although the look like they might be post-isel so there might be something else going on. I'm still not 100% sure how to structure this without causing regressions in places.
; CHECK-NEXT: add w9, w9, #1 | ||
; CHECK-NEXT: str w9, [x8] | ||
; CHECK-NEXT: cmp w9, w1 | ||
; CHECK-NEXT: add x9, x0, #9, lsl #12 // =36864 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you know why this add is not being hoisted out of the loop? It could lead to some performance regressions if it is kept in. I think this same thing is happening in the TSVC benchmarks from the llvm-test-suite.
In this case the add is loop invariant. Could there be other cases where we split a non-loop-invariant add in a loop, leading to more instructions in the loop? The immediate could be moved out and kept in a register in the original version.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fixed this issue on PR74046.
// 12bits are same. | ||
int64_t AArch64TargetLowering::getPreferBaseOffset(int64_t MinOffset, | ||
int64_t MaxOffset) const { | ||
if (MinOffset >> 12 == MaxOffset >> 12 && MinOffset >> 24 == 0) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is assuming a uimm12s1 addressing mode for the load/store? Should it be more precise about the type being used, and maybe make use of isLegalAddressingMode if it can?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
thanks, I updated with API isLegalAddImmediate
@@ -1069,6 +1069,42 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedBitWidth(SDValue N, bool IsSigned | |||
return true; | |||
} | |||
|
|||
// 16-bit optionally shifted immediates are legal for single mov. | |||
static bool isLegalSingleMOVImmediate(int64_t Immed) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can this make use of expandMOVImm?
AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
if (Insn.size() != 1)
return;
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Apply your comment, thanks
hi @davemgreen, I spilt the 2nd patch into pr74046 according your idea, and I'll rebase this patch after that patch is merged because the |
rebase the 1st change after the PR74046 |
A case for this transformation, https://gcc.godbolt.org/z/nhYcWq1WE ``` Fold mov w8, llvm#56952 movk w8, llvm#15, lsl llvm#16 ldrb w0, [x0, x8] into add x0, x0, 1036288 ldrb w0, [x0, 3704] ``` Only support single use base, multi-use scenes are supported by PR74046. Fix llvm#71917 TODO: support the multiple-uses with reuseing common base offset. https://gcc.godbolt.org/z/Mr7srTjnz
I still seem to see some performance issues from this, in the TSVC benchmarks in the llvm test suite. Do you see the same thing? |
PS: add this patch base commit 57a11b7 |
It looks like it was running with I think it is in these loops that look like:
|
Thanks for your help, I can reproduce the regression now, and a simplified case s211 shows that.
|
I've wondered if this was worth adding, at least to check if the current block was in a loop. Otherwise we have had to move transforms to after ISel in order to check they are profitable. In this case I guess we need to know that the base is loop invariant? |
hi @efriedma-quic @efriedma-quic |
close because the new solution: #75343 |
A case for this transformation, https://gcc.godbolt.org/z/nhYcWq1WE
Only support single use base, multi-use scenes are supported by PR74046.
Fix #71917