[AArch64] Add custom lowering for load <3 x i8>. #78632
Conversation
Add custom combine to lower load <3 x i8> as the more efficient sequence below:

ldrb wX, [x0, #2]
ldrh wY, [x0]
orr wX, wY, wX, lsl #16
fmov s0, wX

At the moment, there are almost no cases in which such vector operations will be generated automatically. The motivating case is non-power-of-2 SLP vectorization: #77790
@llvm/pr-subscribers-backend-aarch64

Author: Florian Hahn (fhahn)

Full diff: https://github.com/llvm/llvm-project/pull/78632.diff

2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8a6f1dc7487bae..e1139c2fede8e4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21095,6 +21095,50 @@ static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
return SDValue();
}
+// A custom combine to lower load <3 x i8> as the more efficient sequence
+// below:
+// ldrb wX, [x0, #2]
+// ldrh wY, [x0]
+// orr wX, wY, wX, lsl #16
+// fmov s0, wX
+//
+static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
+ EVT MemVT = LD->getMemoryVT();
+ if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
+ LD->getOriginalAlign() >= 4)
+ return SDValue();
+
+ SDLoc DL(LD);
+ SDValue Chain = LD->getChain();
+ SDValue BasePtr = LD->getBasePtr();
+
+ // Load 2 x i8, then 1 x i8.
+ SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, LD->getPointerInfo(),
+ LD->getOriginalAlign());
+ SDValue L8 =
+ DAG.getLoad(MVT::i8, DL, Chain,
+ DAG.getMemBasePlusOffset(BasePtr, TypeSize::getFixed(2), DL),
+ LD->getPointerInfo(), LD->getOriginalAlign());
+
+ // Extend to i32.
+ SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
+ SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
+
+ // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
+ SDValue Shr = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
+ DAG.getConstant(16, DL, MVT::i32));
+ SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shr);
+ SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
+
+ // Extract v3i8 again.
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
+ DAG.getConstant(0, DL, MVT::i64));
+ SDValue TokenFactor = DAG.getNode(
+ ISD::TokenFactor, DL, MVT::Other,
+ {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
+ return DAG.getMergeValues({Extract, TokenFactor}, DL);
+}
+
// Perform TBI simplification if supported by the target and try to break up
// nontemporal loads larger than 256-bits loads for odd types so LDNPQ 256-bit
// load instructions can be selected.
@@ -21106,10 +21150,16 @@ static SDValue performLOADCombine(SDNode *N,
performTBISimplification(N->getOperand(1), DCI, DAG);
LoadSDNode *LD = cast<LoadSDNode>(N);
- EVT MemVT = LD->getMemoryVT();
- if (LD->isVolatile() || !LD->isNonTemporal() || !Subtarget->isLittleEndian())
+ if (LD->isVolatile() || !Subtarget->isLittleEndian())
+ return SDValue(N, 0);
+
+ if (SDValue Res = combineV3I8LoadExt(LD, DAG))
+ return Res;
+
+ if (!LD->isNonTemporal())
return SDValue(N, 0);
+ EVT MemVT = LD->getMemoryVT();
if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
MemVT.getSizeInBits() % 256 == 0 ||
256 % MemVT.getScalarSizeInBits() != 0)
diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
index 9eeb194409df6f..7cac4134f0e159 100644
--- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
@@ -5,19 +5,10 @@
define <16 x i8> @load_v3i8(ptr %src, ptr %dst) {
; CHECK-LABEL: load_v3i8:
; CHECK: ; %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: ldr s0, [sp, #12]
-; CHECK-NEXT: ushll.8h v0, v0, #0
-; CHECK-NEXT: umov.h w8, v0[0]
-; CHECK-NEXT: umov.h w9, v0[1]
+; CHECK-NEXT: ldrb w8, [x0, #2]
+; CHECK-NEXT: ldrh w9, [x0]
+; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: add x8, x0, #2
-; CHECK-NEXT: mov.b v0[1], w9
-; CHECK-NEXT: ld1.b { v0 }[2], [x8]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; BE-LABEL: load_v3i8:
@@ -47,19 +38,14 @@ define <16 x i8> @load_v3i8(ptr %src, ptr %dst) {
define <4 x i32> @load_v3i8_to_4xi32(ptr %src, ptr %dst) {
; CHECK-LABEL: load_v3i8_to_4xi32:
; CHECK: ; %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: ldrb w8, [x0, #2]
+; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: ldr s0, [sp, #12]
-; CHECK-NEXT: ldrsb w8, [x0, #2]
-; CHECK-NEXT: ushll.8h v0, v0, #0
-; CHECK-NEXT: mov.h v0[1], v0[1]
-; CHECK-NEXT: mov.h v0[2], w8
+; CHECK-NEXT: orr w8, w9, w8, lsl #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: and.16b v0, v0, v1
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; BE-LABEL: load_v3i8_to_4xi32:
@@ -193,19 +179,15 @@ entry:
define void @load_ext_to_64bits(ptr %src, ptr %dst) {
; CHECK-LABEL: load_ext_to_64bits:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: add x8, x0, #2
-; CHECK-NEXT: ldr s0, [sp, #12]
-; CHECK-NEXT: ushll.8h v0, v0, #0
-; CHECK-NEXT: ld1.b { v0 }[4], [x8]
+; CHECK-NEXT: ldrb w8, [x0, #2]
+; CHECK-NEXT: ldrh w9, [x0]
+; CHECK-NEXT: orr w8, w9, w8, lsl #16
+; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: add x8, x1, #4
+; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: bic.4h v0, #255, lsl #8
; CHECK-NEXT: st1.h { v0 }[2], [x8]
; CHECK-NEXT: str s0, [x1]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; BE-LABEL: load_ext_to_64bits:
Is there some reason to prefer that sequence over a shorter sequence, like a pair of ld1r followed by a zip1? I mean, I can imagine your sequence is faster on certain CPUs, but I'd want to document the reasoning.

More variations:

Actually, I guess the following is the shortest, at 2 instructions:
Thanks, this is indeed more compact. I tried to massage the SelectionDAG nodes to generate it (7cc78c5) but it appears there are some cases where this results in slightly more code. I can check where those differences are coming from. In terms of overall cycles, both sequences should be mostly equivalent on the CPUs I checked.
LGTM because I just did a similar thing in our (unfortunately, closed-source) backend. Let's wait for the AArch64 code owners :)
It looks like the INSERT_VECTOR_ELT is getting "optimized" into a BUILD_VECTOR, or something like that, instead of doing a shuffle like it does with your original sequence.
Update checks after adding more tests in e7b4ff8
TypeSize Offset2 = TypeSize::getFixed(2);
SDValue L8 = DAG.getLoad(
    MVT::i8, DL, Chain, DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
    LD->getPointerInfo(), commonAlignment(LD->getOriginalAlign(), Offset2));
MachineFunction::getMachineMemOperand has an overload that takes an existing MachineMemOperand and adds an offset; that will produce a more accurate result here.
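A minimal sketch of that suggestion (hypothetical, not the committed code; it reuses Chain, BasePtr, Offset2, and LD from the patch):

// Derive the narrow load's memory operand from the original one so the
// offset and alignment bookkeeping stays accurate.
MachineFunction &MF = DAG.getMachineFunction();
MachineMemOperand *MMO =
    MF.getMachineMemOperand(LD->getMemOperand(), /*Offset=*/2, /*Size=*/1);
SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
                         DAG.getMemBasePlusOffset(BasePtr, Offset2, DL), MMO);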
Thanks, updated!
✅ With the latest revision this PR passed the C/C++ code formatter.
@efriedma-quic ok I managed to track down where the issue is. The only workaround I could come up with is extending
Extra tests for llvm#78637 llvm#78632 (cherry-picked from ff1cde5)
Extra tests for llvm#78637 llvm#78632 (cherry-picked from e7b4ff8)
Not a fan, but if we must then I think there might still be some gaps...
Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Vec.getValueType(), Vec,
                  SDValue(L, 0), DAG.getConstant(2, dl, MVT::i64));
Vec = DAG.getNode(ISD::BITCAST, dl, MVT::v4i16, Vec);
I think Vec could have quite a variety of unexpected types here (though running at a specific phase of DAG might limit that). There's no reason to expect it to have either 4 elements or for each element to be i16 just from what you've checked so far.
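A guard along these lines would make those assumptions explicit (a hypothetical sketch, using the names from the quoted diff; not the committed fix):

// Bail out unless Vec really has the 4 x i16 shape the bitcast assumes.
EVT VecVT = Vec.getValueType();
if (!VecVT.isFixedLengthVector() || VecVT.getVectorNumElements() != 4 ||
    VecVT.getVectorElementType() != MVT::i16)
  return SDValue();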
SDValue V1 = Op.getOperand(1);
SDValue V2 = Op.getOperand(2);
SDValue V3 = Op.getOperand(3);
if (V0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
This is a hyper-specific pattern. I assume it's because we are specifically looking for and only care about a single <3 x i8> instruction (a load?) and this is what it's been mangled to by the time we get to see it. If so we might have to tolerate the horror, but should at least call it out in comments.
Unfortunately yes! I couldn't find any alternative to prevent the folds that create the sub-optimal nodes. I slightly extended the comment at the top of the function. Do you think that's sufficient or should I also add one here?
if (V0.getOperand(0) != V1.getOperand(0) ||
    V0.getConstantOperandVal(1) != 0 || V1.getConstantOperandVal(1) != 1 ||
    !(V3.isUndef() || V3.getConstantOperandVal(1) == 3))
We're not checking V3.getOperand(0) anywhere.
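A minimal sketch of such a check (hypothetical; variable names follow the quoted condition):

// Also require V3, when it is defined, to extract from the same source
// vector as V0 and V1.
if (!V3.isUndef() && (V3.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
                      V3.getOperand(0) != V0.getOperand(0)))
  return SDValue();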
Yep, added a check, thanks!
SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
SDValue Trunc8 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Ext8);
What are these two doing? They ought to amount to a nop.
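For reference, the round-trip folds away entirely (assuming L8 is the i8 load created earlier in the patch):

// zext i8 -> i32 followed by trunc i32 -> i8 reproduces the original value,
// so the pair is equivalent to using L8 directly.
SDValue Trunc8 = L8;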
This reverts commit 667b5c1fa2527c2fe756673ea2dad54eeecc3e82.
This reverts commit 109038b.
Thanks for taking a look @TNorthover! I tried to address the comments, but with them addressed it turned out not to be feasible to go down that path. I changed the codegen back to the slightly longer sequence below (but one using instructions that are usually cheaper/less complex), with a comment about the alternative sequence using
I'm happier with that, I think. Just one typo I spotted in the new version but no need to reupload.
SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);

// Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
SDValue Shr = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
Mismatch between name and operation.
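Presumably the fix is just renaming the value to match the ISD::SHL it creates, e.g. (the committed rename may differ):

// Shift the third byte into bits 16..23 before merging with the low halfword.
SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
                          DAG.getConstant(16, DL, MVT::i32));
SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);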
Thanks, adjusted! Planning to land this once the pre-commit checks pass.
Add custom combine to lower load <3 x i8> as the more efficient sequence below:

ldrb wX, [x0, #2]
ldrh wY, [x0]
orr wX, wY, wX, lsl #16
fmov s0, wX

At the moment, there are almost no cases in which such vector operations will be generated automatically. The motivating case is non-power-of-2 SLP vectorization: llvm#77790

(cherry-picked from d1e162e)
Add custom combine to lower load <3 x i8> as the more efficient sequence below:
ldrb wX, [x0, #2]
ldrh wY, [x0]
orr wX, wY, wX, lsl #16
fmov s0, wX
At the moment, there are almost no cases in which such vector operations will be generated automatically. The motivating case is non-power-of-2 SLP vectorization: #77790