-
Notifications
You must be signed in to change notification settings - Fork 12.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64] Add custom lowering for load <3 x i8>. #78632
Changes from 11 commits
a786cde
192233f
04bd1e5
d35da6a
39d6794
748f706
e96af2f
9800b2c
7e2bf68
109038b
e6d5725
491f56d
fac6324
ebb84fc
c1013f8
445d9be
ca48e78
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11012,6 +11012,50 @@ SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) { | |
MaskSourceVec); | ||
} | ||
|
||
// Check if Op is a v4i16 BUILD_VECTOR of the form
//   (extract_elt Vec, 0), (extract_elt Vec, 1), (load), (undef | extract_elt Vec, 3)
// where it is cheaper to insert the loaded value into lane 2 of Vec and use a
// shuffle. This improves lowering for loads of <3 x i8>.
//
// Returns the replacement VECTOR_SHUFFLE node, or an empty SDValue if Op does
// not match the pattern.
static SDValue shuffleWithSingleLoad(SDValue Op, SelectionDAG &DAG) {
  if (Op.getNumOperands() != 4 || Op.getValueType() != MVT::v4i16)
    return SDValue();

  SDValue V0 = Op.getOperand(0);
  SDValue V1 = Op.getOperand(1);
  SDValue V2 = Op.getOperand(2);
  SDValue V3 = Op.getOperand(3);
  if (V0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      V1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      V2.getOpcode() != ISD::LOAD ||
      !(V3.isUndef() || V3.getOpcode() == ISD::EXTRACT_VECTOR_ELT))
    return SDValue();

  // The lanes of the source vector are reused verbatim by the shuffle mask
  // below, so the source must already have the result type. Any other type
  // would either make a bitcast to v4i16 invalid or renumber the lanes.
  SDValue Vec = V0.getOperand(0);
  if (Vec.getValueType() != MVT::v4i16)
    return SDValue();

  // True if V extracts the constant lane `Lane` from Vec. A non-constant
  // index never matches (and must not be queried as a constant).
  auto ExtractsLane = [&Vec](SDValue V, uint64_t Lane) {
    auto *Idx = dyn_cast<ConstantSDNode>(V.getOperand(1));
    return V.getOperand(0) == Vec && Idx && Idx->getZExtValue() == Lane;
  };

  // All extracts, including the optional one feeding lane 3, have to read
  // from the same vector: the mask takes lanes 0, 1 and 3 from Vec itself.
  if (!ExtractsLane(V0, 0) || !ExtractsLane(V1, 1) ||
      !(V3.isUndef() || ExtractsLane(V3, 3)))
    return SDValue();

  SDLoc dl(Op);
  // Place the loaded scalar in lane 2 of the source vector.
  Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4i16, Vec, V2,
                    DAG.getConstant(2, dl, MVT::i64));

  // Lane 3 stays undefined unless the BUILD_VECTOR explicitly took it from
  // the source vector.
  int Mask[4] = {0, 1, 2, V3.isUndef() ? -1 : 3};
  return DAG.getVectorShuffle(MVT::v4i16, dl, Vec, DAG.getUNDEF(MVT::v4i16),
                              Mask);
}
|
||
// Gather data to see if the operation can be modelled as a | ||
// shuffle in combination with VEXTs. | ||
SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, | ||
|
@@ -11022,6 +11066,10 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, | |
EVT VT = Op.getValueType(); | ||
assert(!VT.isScalableVector() && | ||
"Scalable vectors cannot be used with ISD::BUILD_VECTOR"); | ||
|
||
if (SDValue S = shuffleWithSingleLoad(Op, DAG)) | ||
return S; | ||
|
||
unsigned NumElts = VT.getVectorNumElements(); | ||
|
||
struct ShuffleSourceInfo { | ||
|
@@ -11048,6 +11096,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, | |
|
||
// First gather all vectors used as an immediate source for this BUILD_VECTOR | ||
// node. | ||
// | ||
SmallVector<ShuffleSourceInfo, 2> Sources; | ||
for (unsigned i = 0; i < NumElts; ++i) { | ||
SDValue V = Op.getOperand(i); | ||
|
@@ -21248,6 +21297,51 @@ static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) { | |
return SDValue(); | ||
} | ||
|
||
// A custom combine to lower load <3 x i8> as the more efficient sequence
// below:
//    ldrb wX, [x0, #2]
//    ldrh wY, [x0]
//    orr wX, wY, wX, lsl #16
//    fmov s0, wX
//
// The <3 x i8> load is split into an i16 load of bytes 0-1 and an i8 load of
// byte 2, which are then reassembled in a vector register. Returns the merged
// {value, chain} replacement, or an empty SDValue if LD is not a candidate.
static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
  EVT MemVT = LD->getMemoryVT();
  // The replacement value has type MemVT, so only handle loads whose result
  // type equals the memory type (i.e. not an extending load).
  // NOTE(review): loads with alignment >= 4 are skipped; presumably they are
  // better served by a single wider load elsewhere - confirm.
  if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
      LD->getValueType(0) != MemVT || LD->getOriginalAlign() >= 4)
    return SDValue();

  SDLoc DL(LD);
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain = LD->getChain();
  SDValue BasePtr = LD->getBasePtr();
  MachineMemOperand *MMO = LD->getMemOperand();
  assert(LD->getOffset().isUndef() && "undef offset expected");

  // Load 2 x i8 (as one i16), then 1 x i8 from offset 2.
  SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr,
                            MF.getMachineMemOperand(MMO, 0, 2));
  TypeSize Offset2 = TypeSize::getFixed(2);
  SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
                           DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
                           MF.getMachineMemOperand(MMO, 2, 1));

  // Put the i16 into a vector and reinterpret it as bytes, so that bytes 0
  // and 1 of the v8i8 hold the first two elements.
  SDValue Ins16 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::v4i16, L16);
  SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Ins16);

  // Insert the third byte directly; extending it to i32 and truncating it
  // back to i8 first would be a no-op.
  SDValue Ins8 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i8, Cast, L8,
                             DAG.getConstant(2, DL, MVT::i64));
  SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Ins8,
                                DAG.getConstant(0, DL, MVT::i64));

  // Join the chains of both narrow loads so users of the original load's
  // chain are ordered after both of them.
  SDValue TokenFactor = DAG.getNode(
      ISD::TokenFactor, DL, MVT::Other,
      {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
  return DAG.getMergeValues({Extract, TokenFactor}, DL);
}
|
||
// Perform TBI simplification if supported by the target and try to break up | ||
// nontemporal loads larger than 256-bits loads for odd types so LDNPQ 256-bit | ||
// load instructions can be selected. | ||
|
@@ -21259,10 +21353,16 @@ static SDValue performLOADCombine(SDNode *N, | |
performTBISimplification(N->getOperand(1), DCI, DAG); | ||
|
||
LoadSDNode *LD = cast<LoadSDNode>(N); | ||
EVT MemVT = LD->getMemoryVT(); | ||
if (LD->isVolatile() || !LD->isNonTemporal() || !Subtarget->isLittleEndian()) | ||
if (LD->isVolatile() || !Subtarget->isLittleEndian()) | ||
return SDValue(N, 0); | ||
|
||
if (SDValue Res = combineV3I8LoadExt(LD, DAG)) | ||
return Res; | ||
|
||
if (!LD->isNonTemporal()) | ||
return SDValue(N, 0); | ||
|
||
EVT MemVT = LD->getMemoryVT(); | ||
if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 || | ||
MemVT.getSizeInBits() % 256 == 0 || | ||
256 % MemVT.getScalarSizeInBits() != 0) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a hyper-specific pattern. I assume it's because we are specifically looking for and only care about a single `<3 x i8>` instruction (a load?) and this is what it's been mangled to by the time we get to see it. If so we might have to tolerate the horror, but should at least call it out in comments.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unfortunately yes! I couldn't find any alternative to prevent the folds that create the sub-optimal nodes. I slightly extended the comment at the top of the function. Do you think that's sufficient or should I also add one here?