Skip to content

Commit

Permalink
Try using LD1r.
Browse files Browse the repository at this point in the history
  • Loading branch information
fhahn committed Jan 25, 2024
1 parent 7e2bf68 commit 109038b
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 51 deletions.
68 changes: 57 additions & 11 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11012,6 +11012,48 @@ SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) {
MaskSourceVec);
}

// Check if Op is a v4i16 BUILD_VECTOR of the form
//   (extract_elt Vec, 0), (extract_elt Vec, 1), (load), (undef | extract_elt Vec, 3)
// where Vec is itself a v4i16 vector. Such a node is cheaper to lower by
// inserting the loaded value into lane 2 of Vec and shuffling the result. This
// improves lowering for loads of <3 x i8>.
//
// Returns the replacement shuffle, or an empty SDValue if Op does not match
// the pattern.
static SDValue shuffleWithSingleLoad(SDValue Op, SelectionDAG &DAG) {
  if (Op.getOpcode() != ISD::BUILD_VECTOR || Op.getNumOperands() != 4 ||
      Op.getValueType() != MVT::v4i16)
    return SDValue();

  SDValue V0 = Op.getOperand(0);
  SDValue V1 = Op.getOperand(1);
  SDValue V2 = Op.getOperand(2);
  SDValue V3 = Op.getOperand(3);

  // Only the loaded value itself (result 0 of the load node) qualifies; any
  // other result of the node is not the loaded data.
  if (V0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      V2.getOpcode() != ISD::LOAD || V2.getResNo() != 0)
    return SDValue();

  // The lanes of Vec must line up one-to-one with the lanes of the result,
  // otherwise "insert at lane 2" would not describe the same vector as the
  // BUILD_VECTOR does.
  SDValue Vec = V0.getOperand(0);
  if (Vec.getValueType() != MVT::v4i16)
    return SDValue();

  // True if V extracts the constant lane `Lane` from Vec. The index operand is
  // checked to be a constant before its value is read, because an
  // EXTRACT_VECTOR_ELT may carry a variable index.
  auto IsExtractOfLane = [&Vec](SDValue V, uint64_t Lane) {
    if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT || V.getOperand(0) != Vec)
      return false;
    auto *Idx = dyn_cast<ConstantSDNode>(V.getOperand(1));
    return Idx && Idx->getZExtValue() == Lane;
  };

  if (!IsExtractOfLane(V0, 0) || !IsExtractOfLane(V1, 1))
    return SDValue();

  // Lane 3 is either unused or must come from lane 3 of the very same source
  // vector; an extract from any other vector cannot be taken from Vec.
  const bool Lane3Undef = V3.isUndef();
  if (!Lane3Undef && !IsExtractOfLane(V3, 3))
    return SDValue();

  SDLoc dl(Op);
  SDValue Inserted =
      DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4i16, Vec, V2,
                  DAG.getConstant(2, dl, MVT::i64));

  // Identity shuffle over the lanes that are defined; lane 3 stays undefined
  // when the BUILD_VECTOR did not specify it.
  const int Mask[] = {0, 1, 2, Lane3Undef ? -1 : 3};
  return DAG.getVectorShuffle(MVT::v4i16, dl, Inserted,
                              DAG.getUNDEF(MVT::v4i16), Mask);
}

// Gather data to see if the operation can be modelled as a
// shuffle in combination with VEXTs.
SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
Expand All @@ -11022,6 +11064,10 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
EVT VT = Op.getValueType();
assert(!VT.isScalableVector() &&
"Scalable vectors cannot be used with ISD::BUILD_VECTOR");

if (SDValue S = shuffleWithSingleLoad(Op, DAG))
return S;

unsigned NumElts = VT.getVectorNumElements();

struct ShuffleSourceInfo {
Expand All @@ -11048,6 +11094,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,

// First gather all vectors used as an immediate source for this BUILD_VECTOR
// node.
//
SmallVector<ShuffleSourceInfo, 2> Sources;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
Expand Down Expand Up @@ -21269,24 +21316,23 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
assert(LD->getOffset().isUndef() && "undef offset expected");

// Load 2 x i8, then 1 x i8.
SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr,
MF.getMachineMemOperand(MMO, 0, 2));
TypeSize Offset2 = TypeSize::getFixed(2);
SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
MF.getMachineMemOperand(MMO, 2, 1));

// Extend to i32.
SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
SDValue Ins16 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::v4i16, L16);

// Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
SDValue Shr = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
DAG.getConstant(16, DL, MVT::i32));
SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shr);
SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Ins16);

SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
SDValue Trunc8 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Ext8);

// Extract v3i8 again.
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
SDValue Ins8 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i8, Cast,
Trunc8, DAG.getConstant(2, DL, MVT::i64));
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Ins8,
DAG.getConstant(0, DL, MVT::i64));
SDValue TokenFactor = DAG.getNode(
ISD::TokenFactor, DL, MVT::Other,
Expand Down
58 changes: 18 additions & 40 deletions llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,8 @@
define <16 x i8> @load_v3i8(ptr %src) {
; CHECK-LABEL: load_v3i8:
; CHECK: ; %bb.0:
; CHECK-NEXT: ldrb w8, [x0, #2]
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2
; CHECK-NEXT: ld1.b { v0 }[2], [x0]
; CHECK-NEXT: ret
;
; BE-LABEL: load_v3i8:
Expand Down Expand Up @@ -38,12 +36,9 @@ define <16 x i8> @load_v3i8(ptr %src) {
define <4 x i32> @load_v3i8_to_4xi32(ptr %src) {
; CHECK-LABEL: load_v3i8_to_4xi32:
; CHECK: ; %bb.0:
; CHECK-NEXT: ldrb w8, [x0, #2]
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: ld1.b { v0 }[2], [x0]
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: and.16b v0, v0, v1
; CHECK-NEXT: ret
Expand All @@ -59,7 +54,6 @@ define <4 x i32> @load_v3i8_to_4xi32(ptr %src) {
; BE-NEXT: ldrsb w8, [x0, #2]
; BE-NEXT: rev32 v0.8b, v0.8b
; BE-NEXT: ushll v0.8h, v0.8b, #0
; BE-NEXT: mov v0.h[1], v0.h[1]
; BE-NEXT: mov v0.h[2], w8
; BE-NEXT: ushll v0.4s, v0.4h, #0
; BE-NEXT: and v0.16b, v0.16b, v1.16b
Expand All @@ -76,12 +70,9 @@ define <4 x i32> @load_v3i8_to_4xi32(ptr %src) {
define <4 x i32> @load_v3i8_to_4xi32_align_2(ptr %src) {
; CHECK-LABEL: load_v3i8_to_4xi32_align_2:
; CHECK: ; %bb.0:
; CHECK-NEXT: ldrb w8, [x0, #2]
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: ld1.b { v0 }[2], [x0]
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: and.16b v0, v0, v1
; CHECK-NEXT: ret
Expand All @@ -97,7 +88,6 @@ define <4 x i32> @load_v3i8_to_4xi32_align_2(ptr %src) {
; BE-NEXT: ldrsb w8, [x0, #2]
; BE-NEXT: rev32 v0.8b, v0.8b
; BE-NEXT: ushll v0.8h, v0.8b, #0
; BE-NEXT: mov v0.h[1], v0.h[1]
; BE-NEXT: mov v0.h[2], w8
; BE-NEXT: ushll v0.4s, v0.4h, #0
; BE-NEXT: and v0.16b, v0.16b, v1.16b
Expand Down Expand Up @@ -141,12 +131,11 @@ define <4 x i32> @load_v3i8_to_4xi32_align_4(ptr %src) {
define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) {
; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_1:
; CHECK: ; %bb.0:
; CHECK-NEXT: ldrb w8, [x0, #3]
; CHECK-NEXT: ldurh w9, [x0, #1]
; CHECK-NEXT: add x8, x0, #1
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: ld1r.4h { v0 }, [x8]
; CHECK-NEXT: add x8, x0, #3
; CHECK-NEXT: ld1.b { v0 }[2], [x8]
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: and.16b v0, v0, v1
; CHECK-NEXT: ret
Expand All @@ -162,7 +151,6 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) {
; BE-NEXT: ldrsb w8, [x0, #3]
; BE-NEXT: rev32 v0.8b, v0.8b
; BE-NEXT: ushll v0.8h, v0.8b, #0
; BE-NEXT: mov v0.h[1], v0.h[1]
; BE-NEXT: mov v0.h[2], w8
; BE-NEXT: ushll v0.4s, v0.4h, #0
; BE-NEXT: and v0.16b, v0.16b, v1.16b
Expand All @@ -180,12 +168,11 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) {
define <4 x i32> @load_v3i8_to_4xi32_const_offset_3(ptr %src) {
; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_3:
; CHECK: ; %bb.0:
; CHECK-NEXT: ldrb w8, [x0, #5]
; CHECK-NEXT: ldurh w9, [x0, #3]
; CHECK-NEXT: add x8, x0, #3
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: ld1r.4h { v0 }, [x8]
; CHECK-NEXT: add x8, x0, #5
; CHECK-NEXT: ld1.b { v0 }[2], [x8]
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: and.16b v0, v0, v1
; CHECK-NEXT: ret
Expand All @@ -201,7 +188,6 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_3(ptr %src) {
; BE-NEXT: ldrsb w8, [x0, #5]
; BE-NEXT: rev32 v0.8b, v0.8b
; BE-NEXT: ushll v0.8h, v0.8b, #0
; BE-NEXT: mov v0.h[1], v0.h[1]
; BE-NEXT: mov v0.h[2], w8
; BE-NEXT: ushll v0.4s, v0.4h, #0
; BE-NEXT: and v0.16b, v0.16b, v1.16b
Expand Down Expand Up @@ -263,7 +249,6 @@ define <4 x i32> @volatile_load_v3i8_to_4xi32(ptr %src) {
; CHECK-NEXT: ldr s0, [sp, #12]
; CHECK-NEXT: ldrsb w8, [x0, #2]
; CHECK-NEXT: ushll.8h v0, v0, #0
; CHECK-NEXT: mov.h v0[1], v0[1]
; CHECK-NEXT: mov.h v0[2], w8
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: and.16b v0, v0, v1
Expand All @@ -281,7 +266,6 @@ define <4 x i32> @volatile_load_v3i8_to_4xi32(ptr %src) {
; BE-NEXT: ldrsb w8, [x0, #2]
; BE-NEXT: rev32 v0.8b, v0.8b
; BE-NEXT: ushll v0.8h, v0.8b, #0
; BE-NEXT: mov v0.h[1], v0.h[1]
; BE-NEXT: mov v0.h[2], w8
; BE-NEXT: ushll v0.4s, v0.4h, #0
; BE-NEXT: and v0.16b, v0.16b, v1.16b
Expand Down Expand Up @@ -410,12 +394,9 @@ entry:
define void @load_ext_to_64bits(ptr %src, ptr %dst) {
; CHECK-LABEL: load_ext_to_64bits:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: ldrb w8, [x0, #2]
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2
; CHECK-NEXT: add x8, x1, #4
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: ld1.b { v0 }[2], [x0]
; CHECK-NEXT: bic.4h v0, #255, lsl #8
; CHECK-NEXT: st1.h { v0 }[2], [x8]
; CHECK-NEXT: str s0, [x1]
Expand Down Expand Up @@ -507,16 +488,13 @@ entry:
define void @load_ext_add_to_64bits(ptr %src, ptr %dst) {
; CHECK-LABEL: load_ext_add_to_64bits:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: ldrb w9, [x0, #2]
; CHECK-NEXT: ldrh w10, [x0]
; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2
; CHECK-NEXT: Lloh2:
; CHECK-NEXT: adrp x8, lCPI13_0@PAGE
; CHECK-NEXT: Lloh3:
; CHECK-NEXT: ldr d1, [x8, lCPI13_0@PAGEOFF]
; CHECK-NEXT: add x8, x1, #4
; CHECK-NEXT: orr w9, w10, w9, lsl #16
; CHECK-NEXT: fmov s0, w9
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: ld1.b { v0 }[2], [x0]
; CHECK-NEXT: bic.4h v0, #255, lsl #8
; CHECK-NEXT: add.4h v0, v0, v1
; CHECK-NEXT: st1.h { v0 }[2], [x8]
Expand Down

0 comments on commit 109038b

Please sign in to comment.