[AArch64] Combine store (trunc X to <3 x i8>) to sequence of ST1.b. #78637
Conversation
Improve codegen for (trunc X to <3 x i8>) by converting it to a sequence of 3 ST1.b, but first converting the truncate operand to either v8i8 or v16i8, extracting the lanes for the truncate results and storing them. At the moment, there are almost no cases in which such vector operations will be generated automatically. The motivating case is non-power-of-2 SLP vectorization: llvm#77790
@llvm/pr-subscribers-backend-aarch64

Author: Florian Hahn (fhahn)

Full diff: https://github.com/llvm/llvm-project/pull/78637.diff (2 files affected)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8a6f1dc7487bae..4be78f61fe7b65 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21318,6 +21318,53 @@ bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
(SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
}
+// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
+static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ SDValue Value = ST->getValue();
+ EVT ValueVT = Value.getValueType();
+
+ if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
+ ST->getOriginalAlign() >= 4 || Value.getOpcode() != ISD::TRUNCATE ||
+ ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
+ return SDValue();
+
+ SDLoc DL(ST);
+ auto WideVT = EVT::getVectorVT(
+ *DAG.getContext(),
+ Value->getOperand(0).getValueType().getVectorElementType(), 4);
+ SDValue UndefVector = DAG.getUNDEF(WideVT);
+ SDValue WideTrunc = DAG.getNode(
+ ISD::INSERT_SUBVECTOR, DL, WideVT,
+ {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
+ SDValue Cast = DAG.getNode(
+ ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
+ WideTrunc);
+
+ SDValue Chain = ST->getChain();
+ SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
+ DAG.getConstant(8, DL, MVT::i64));
+
+ SDValue Ptr2 =
+ DAG.getMemBasePlusOffset(ST->getBasePtr(), TypeSize::getFixed(2), DL);
+ Chain = DAG.getStore(Chain, DL, E2, Ptr2, ST->getPointerInfo(),
+ ST->getOriginalAlign());
+
+ SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
+ DAG.getConstant(4, DL, MVT::i64));
+
+ SDValue Ptr1 =
+ DAG.getMemBasePlusOffset(ST->getBasePtr(), TypeSize::getFixed(1), DL);
+ Chain = DAG.getStore(Chain, DL, E1, Ptr1, ST->getPointerInfo(),
+ ST->getOriginalAlign());
+ SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
+ DAG.getConstant(0, DL, MVT::i64));
+ Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(), ST->getPointerInfo(),
+ ST->getOriginalAlign());
+
+ return Chain;
+}
+
static SDValue performSTORECombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
@@ -21333,6 +21380,9 @@ static SDValue performSTORECombine(SDNode *N,
return EltVT == MVT::f32 || EltVT == MVT::f64;
};
+ if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
+ return Res;
+
// If this is an FP_ROUND followed by a store, fold this into a truncating
// store. We can do this even if this is already a truncstore.
// We purposefully don't care about legality of the nodes here as we know
diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
index 9eeb194409df6f..60639ea91fbaa1 100644
--- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
@@ -154,17 +154,12 @@ define <3 x i32> @load_v3i32(ptr %src) {
define void @store_trunc_from_64bits(ptr %src, ptr %dst) {
; CHECK-LABEL: store_trunc_from_64bits:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: ldrh w8, [x0, #4]
-; CHECK-NEXT: mov.h v0[2], w8
-; CHECK-NEXT: xtn.8b v0, v0
-; CHECK-NEXT: str s0, [sp, #12]
-; CHECK-NEXT: ldrh w9, [sp, #12]
-; CHECK-NEXT: strb w8, [x1, #2]
-; CHECK-NEXT: strh w9, [x1]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: add x8, x0, #4
+; CHECK-NEXT: ld1r.4h { v0 }, [x8]
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: strb w8, [x1]
+; CHECK-NEXT: add x8, x1, #1
+; CHECK-NEXT: st1.b { v0 }[4], [x8]
; CHECK-NEXT: ret
;
; BE-LABEL: store_trunc_from_64bits:
@@ -236,17 +231,13 @@ entry:
define void @shift_trunc_store(ptr %src, ptr %dst) {
; CHECK-LABEL: shift_trunc_store:
; CHECK: ; %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: shrn.4h v0, v0, #16
-; CHECK-NEXT: xtn.8b v1, v0
-; CHECK-NEXT: umov.h w8, v0[2]
-; CHECK-NEXT: str s1, [sp, #12]
-; CHECK-NEXT: ldrh w9, [sp, #12]
-; CHECK-NEXT: strb w8, [x1, #2]
-; CHECK-NEXT: strh w9, [x1]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: add x8, x1, #1
+; CHECK-NEXT: add x9, x1, #2
+; CHECK-NEXT: ushr.4s v0, v0, #16
+; CHECK-NEXT: st1.b { v0 }[4], [x8]
+; CHECK-NEXT: st1.b { v0 }[8], [x9]
+; CHECK-NEXT: st1.b { v0 }[0], [x1]
; CHECK-NEXT: ret
;
; BE-LABEL: shift_trunc_store:
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: add x8, x0, #4
; CHECK-NEXT: ld1r.4h { v0 }, [x8]
; CHECK-NEXT: ldr w8, [x0]
This isn't really doing a good job of demonstrating what you're trying to do here... maybe add a testcase with some arithmetic, so the store and the load can't be combined together?
Also, this looks like it's getting miscompiled; it's only storing two bytes.
> This isn't really doing a good job of demonstrating what you're trying to do here... maybe add a testcase with some arithmetic, so the store and the load can't be combined together?
The store and load have different addresses, so it shouldn't be possible to combine them. Is there another combining opportunity I am missing?
> Also, this looks like it's getting miscompiled; it's only storing two bytes.
Yes, this was using incorrect extract indices; should be fixed now, thanks!
> The store and load have different addresses, so it shouldn't be possible to combine them
I mean, the load involves some INSERT_VECTOR_ELT, and the store involves some EXTRACT_VECTOR_ELT, and the two are getting combined together, so it's not a "pure" store sequence; one of the stores is actually from an integer register. If the combiners get smarter, you might end up not using vector ops at all.
Ah, I see, thanks! Added additional variants in e7b4ff8 and updated the PR.
Update checks after adding more tests in e7b4ff8
EVT ValueVT = Value.getValueType();

if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
    ST->getOriginalAlign() >= 4 || Value.getOpcode() != ISD::TRUNCATE ||
What is ST->getOriginalAlign() >= 4 protecting against? Increasing the known alignment of the pointer doesn't change the generated code, as far as I can tell.
Yes, this was only relevant for the load case, I think. I will remove it (and also add tests with different alignments).
Removed the check and added new tests with different alignments; the alignment impacts load codegen only.
; CHECK-NEXT: xtn.8b v1, v0
; CHECK-NEXT: umov.h w8, v0[2]
; CHECK-NEXT: str s1, [sp, #12]
; CHECK-NEXT: ldrh w9, [sp, #12]
Spent a little time looking at why the default code is so horrible; the primary issue is actually the way the legalizer (GenWidenVectorStores) is trying to lower the operation into an i16 store followed by an i8 store. It ends up generating a bitcast from v4i8 to v2i16, and the default handling for that is completely terrible (it doesn't know how to use a shuffle, so it goes through a stack temporary).
Maybe worth looking into improving the bitcast situation if you're going to continue looking at very narrow vector types.
Yep, I am planning to look into this as well.
Actually, I am not entirely sure what the exact issue is; the bitcast itself should be a no-op. Perhaps the issue is that the vector types aren't legal?

It seems like we are going from v3i8 -> v4i8, which isn't legal, and then going from v4i8 -> v4i16, which is legal. Perhaps it would be better to go from v3i8 to v8i8, but I couldn't find a way to do that; setting custom actions requires the from & to types to be MVTs, but v3i8 isn't a simple value type unfortunately.
v4i8 and v2i16 aren't legal types. And the legalization rule we use for them is "promote", so the bitcast isn't a no-op. There's padding between the elements. So we need to do a shuffle... but neither AArch64TargetLowering::ReplaceBITCASTResults nor DAGTypeLegalizer::PromoteIntRes_BITCAST has code to do that, so we end up using CreateStackStoreLoad.
The legalization rule for vector types is under the control of the target; see AArch64TargetLowering::getPreferredVectorAction. You can use that to force v4i8->v8i8. But changing that impacts a lot of code.
Thanks, I'll look into that separately.
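For illustration, a minimal sketch of how the getPreferredVectorAction hook mentioned above could be extended to force v4i8 to be widened rather than promoted. This is an assumption about the shape of such a change, not code from this PR:

```cpp
// Illustrative sketch only: ask the type legalizer to widen v4i8 (to the next
// legal vector type, v8i8) instead of promoting its elements. As noted above,
// overriding this affects legalization of every operation on v4i8, so it
// would need broad test updates.
TargetLoweringBase::LegalizeTypeAction
AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
  if (VT == MVT::v4i8)
    return TypeWidenVector;
  // Fall back to the default heuristics for all other types.
  return TargetLoweringBase::getPreferredVectorAction(VT);
}
```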
ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
WideTrunc);

unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
Instead of writing this out, can we just use TargetLowering::scalarizeVectorStore? I think it does roughly the same thing.
I am not sure; I tried, but the code here combines the trunc with the stores. It looks like scalarizeVectorStore would generate a 16-bit store and an 8-bit store, and we first need to do the cast separately.
If it doesn't work, that's fine.
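To make the index scaling concrete, here is a minimal sketch of how the scaled extract indices pick out the low byte of each lane after the bitcast. The loop form and comments are illustrative assumptions, simplified from the diff above, not the committed code:

```cpp
// After widening the truncate operand (e.g. v3i32 -> v4i32) and bitcasting it
// to bytes (v16i8), the low byte of lane I sits at byte index I * IdxScale on
// a little-endian target.
unsigned IdxScale = WideVT.getScalarSizeInBits() / 8; // 4 for i32, 2 for i16
SDValue Chain = ST->getChain();
for (unsigned I = 0; I != 3; ++I) {
  SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
                            DAG.getConstant(I * IdxScale, DL, MVT::i64));
  SDValue Ptr = DAG.getMemBasePlusOffset(ST->getBasePtr(),
                                         TypeSize::getFixed(I), DL);
  // The memory operand should also be narrowed to 1 byte; see the
  // getMachineMemOperand discussion below.
  Chain = DAG.getStore(Chain, DL, Elt, Ptr, ST->getPointerInfo(),
                       ST->getOriginalAlign());
}
```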
Add extra tests with different load/store alignments for #78637.
Updated, comments should be addressed and also updated to use …
SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
                         DAG.getConstant(0, DL, MVT::i64));
Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(), ST->getMemOperand());
Missing getMachineMemOperand call here? (The MachineMemOperand for the original store has the wrong size.)
Updated, thanks!
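A hedged sketch of the kind of fix being suggested (the exact committed code may differ): derive a fresh MachineMemOperand with the correct 1-byte size from the original store's memory operand, rather than reusing the 3-byte one:

```cpp
// Sketch (assumed, not necessarily the committed code): narrow the original
// store's MachineMemOperand to the single byte this store actually writes.
MachineFunction &MF = DAG.getMachineFunction();
MachineMemOperand *NewMMO =
    MF.getMachineMemOperand(ST->getMemOperand(), /*Offset=*/0, /*Size=*/1);
Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(), NewMMO);
```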
LGTM
Extra tests for llvm#78637 llvm#78632 (cherry-picked from ff1cde5)
Extra tests for llvm#78637 llvm#78632 (cherry-picked from e7b4ff8)
Add extra tests with different load/store alignments for llvm#78637. (cherry-picked from 98509c7)
[AArch64] Combine store (trunc X to <3 x i8>) to sequence of ST1.b. (llvm#78637) Improve codegen for (trunc X to <3 x i8>) by converting it to a sequence of 3 ST1.b, but first converting the truncate operand to either v8i8 or v16i8, extracting the lanes for the truncate results and storing them. At the moment, there are almost no cases in which such vector operations will be generated automatically. The motivating case is non-power-of-2 SLP vectorization: llvm#77790 PR: llvm#78637 (cherry-picked from eb678d8)