From 109038bab1328d667a6e2eaf01acc82c33c95431 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 19 Jan 2024 16:40:46 +0000 Subject: [PATCH] Try using LD1r. --- .../Target/AArch64/AArch64ISelLowering.cpp | 68 ++++++++++++++++--- .../AArch64/vec3-loads-ext-trunc-stores.ll | 58 +++++----------- 2 files changed, 75 insertions(+), 51 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 00d62b7450f3cb..6dc56ab3347a1e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11012,6 +11012,48 @@ SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) { MaskSourceVec); } +// Check if Op is a BUILD_VECTOR with 2 extracts and a load that is cheaper to +// insert into a vector and use a shuffle. This improves lowering for loads of +// <3 x i8>. +static SDValue shuffleWithSingleLoad(SDValue Op, SelectionDAG &DAG) { + if (Op.getNumOperands() != 4 || Op.getValueType() != MVT::v4i16) + return SDValue(); + + SDValue V0 = Op.getOperand(0); + SDValue V1 = Op.getOperand(1); + SDValue V2 = Op.getOperand(2); + SDValue V3 = Op.getOperand(3); + if (V0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + V1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + V2.getOpcode() != ISD::LOAD || !(V3.isUndef() || V3.getOpcode() == ISD::EXTRACT_VECTOR_ELT)) + return SDValue(); + + if (V0.getOperand(0) != V1.getOperand(0) || + V0.getConstantOperandVal(1) != 0 || V1.getConstantOperandVal(1) != 1 || !(V3.isUndef() || V3.getConstantOperandVal(1) == 3)) + return SDValue(); + + SDLoc dl(Op); + auto *L = cast<LoadSDNode>(Op.getOperand(2)); + auto Vec = V0.getOperand(0); + + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Vec.getValueType(), Vec, + SDValue(L, 0), DAG.getConstant(2, dl, MVT::i64)); + Vec = DAG.getNode(ISD::BITCAST, dl, MVT::v4i16, Vec); + + SDValue ShuffleOps[] = {DAG.getUNDEF(MVT::v4i16), DAG.getUNDEF(MVT::v4i16)}; + ShuffleOps[0] = Vec; + + SmallVector<int, 8> Mask(4, 
-1); + Mask[0] = 0; + Mask[1] = 1; + Mask[2] = 2; + if (!V3.isUndef()) + Mask[3] = 3; + SDValue Shuffle = + DAG.getVectorShuffle(MVT::v4i16, dl, ShuffleOps[0], ShuffleOps[1], Mask); + return Shuffle; +} + // Gather data to see if the operation can be modelled as a // shuffle in combination with VEXTs. SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, @@ -11022,6 +11064,10 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, EVT VT = Op.getValueType(); assert(!VT.isScalableVector() && "Scalable vectors cannot be used with ISD::BUILD_VECTOR"); + + if (SDValue S = shuffleWithSingleLoad(Op, DAG)) + return S; + unsigned NumElts = VT.getVectorNumElements(); struct ShuffleSourceInfo { @@ -11048,6 +11094,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, // First gather all vectors used as an immediate source for this BUILD_VECTOR // node. + // SmallVector<ShuffleSourceInfo, 2> Sources; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); @@ -21269,24 +21316,23 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) { assert(LD->getOffset().isUndef() && "undef offset expected"); // Load 2 x i8, then 1 x i8. - SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO); + SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, + MF.getMachineMemOperand(MMO, 0, 2)); TypeSize Offset2 = TypeSize::getFixed(2); SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain, DAG.getMemBasePlusOffset(BasePtr, Offset2, DL), MF.getMachineMemOperand(MMO, 2, 1)); - // Extend to i32. - SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16); - SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8); + SDValue Ins16 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::v4i16, L16); - // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8. 
- SDValue Shr = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8, - DAG.getConstant(16, DL, MVT::i32)); - SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shr); - SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or); + SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Ins16); + + SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8); + SDValue Trunc8 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Ext8); - // Extract v3i8 again. - SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast, + SDValue Ins8 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i8, Cast, + Trunc8, DAG.getConstant(2, DL, MVT::i64)); + SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Ins8, DAG.getConstant(0, DL, MVT::i64)); SDValue TokenFactor = DAG.getNode( ISD::TokenFactor, DL, MVT::Other, diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 275e5ac8b7062e..248aa20bab6329 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -5,10 +5,8 @@ define <16 x i8> @load_v3i8(ptr %src) { ; CHECK-LABEL: load_v3i8: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldrb w8, [x0, #2] -; CHECK-NEXT: ldrh w9, [x0] -; CHECK-NEXT: orr w8, w9, w8, lsl #16 -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2 +; CHECK-NEXT: ld1.b { v0 }[2], [x0] ; CHECK-NEXT: ret ; ; BE-LABEL: load_v3i8: @@ -38,12 +36,9 @@ define <16 x i8> @load_v3i8(ptr %src) { define <4 x i32> @load_v3i8_to_4xi32(ptr %src) { ; CHECK-LABEL: load_v3i8_to_4xi32: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldrb w8, [x0, #2] -; CHECK-NEXT: ldrh w9, [x0] +; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2 ; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff -; CHECK-NEXT: orr w8, w9, w8, lsl #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: zip1.8b v0, v0, v0 +; CHECK-NEXT: ld1.b { v0 }[2], [x0] ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 ; CHECK-NEXT: ret @@ 
-59,7 +54,6 @@ define <4 x i32> @load_v3i8_to_4xi32(ptr %src) { ; BE-NEXT: ldrsb w8, [x0, #2] ; BE-NEXT: rev32 v0.8b, v0.8b ; BE-NEXT: ushll v0.8h, v0.8b, #0 -; BE-NEXT: mov v0.h[1], v0.h[1] ; BE-NEXT: mov v0.h[2], w8 ; BE-NEXT: ushll v0.4s, v0.4h, #0 ; BE-NEXT: and v0.16b, v0.16b, v1.16b @@ -76,12 +70,9 @@ define <4 x i32> @load_v3i8_to_4xi32(ptr %src) { define <4 x i32> @load_v3i8_to_4xi32_align_2(ptr %src) { ; CHECK-LABEL: load_v3i8_to_4xi32_align_2: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldrb w8, [x0, #2] -; CHECK-NEXT: ldrh w9, [x0] +; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2 ; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff -; CHECK-NEXT: orr w8, w9, w8, lsl #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: zip1.8b v0, v0, v0 +; CHECK-NEXT: ld1.b { v0 }[2], [x0] ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 ; CHECK-NEXT: ret @@ -97,7 +88,6 @@ define <4 x i32> @load_v3i8_to_4xi32_align_2(ptr %src) { ; BE-NEXT: ldrsb w8, [x0, #2] ; BE-NEXT: rev32 v0.8b, v0.8b ; BE-NEXT: ushll v0.8h, v0.8b, #0 -; BE-NEXT: mov v0.h[1], v0.h[1] ; BE-NEXT: mov v0.h[2], w8 ; BE-NEXT: ushll v0.4s, v0.4h, #0 ; BE-NEXT: and v0.16b, v0.16b, v1.16b @@ -141,12 +131,11 @@ define <4 x i32> @load_v3i8_to_4xi32_align_4(ptr %src) { define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) { ; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_1: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldrb w8, [x0, #3] -; CHECK-NEXT: ldurh w9, [x0, #1] +; CHECK-NEXT: add x8, x0, #1 ; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff -; CHECK-NEXT: orr w8, w9, w8, lsl #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: zip1.8b v0, v0, v0 +; CHECK-NEXT: ld1r.4h { v0 }, [x8] +; CHECK-NEXT: add x8, x0, #3 +; CHECK-NEXT: ld1.b { v0 }[2], [x8] ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 ; CHECK-NEXT: ret @@ -162,7 +151,6 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) { ; BE-NEXT: ldrsb w8, [x0, #3] ; BE-NEXT: rev32 v0.8b, v0.8b ; BE-NEXT: ushll v0.8h, v0.8b, #0 -; BE-NEXT: mov v0.h[1], v0.h[1] ; 
BE-NEXT: mov v0.h[2], w8 ; BE-NEXT: ushll v0.4s, v0.4h, #0 ; BE-NEXT: and v0.16b, v0.16b, v1.16b @@ -180,12 +168,11 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) { define <4 x i32> @load_v3i8_to_4xi32_const_offset_3(ptr %src) { ; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_3: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldrb w8, [x0, #5] -; CHECK-NEXT: ldurh w9, [x0, #3] +; CHECK-NEXT: add x8, x0, #3 ; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff -; CHECK-NEXT: orr w8, w9, w8, lsl #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: zip1.8b v0, v0, v0 +; CHECK-NEXT: ld1r.4h { v0 }, [x8] +; CHECK-NEXT: add x8, x0, #5 +; CHECK-NEXT: ld1.b { v0 }[2], [x8] ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 ; CHECK-NEXT: ret @@ -201,7 +188,6 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_3(ptr %src) { ; BE-NEXT: ldrsb w8, [x0, #5] ; BE-NEXT: rev32 v0.8b, v0.8b ; BE-NEXT: ushll v0.8h, v0.8b, #0 -; BE-NEXT: mov v0.h[1], v0.h[1] ; BE-NEXT: mov v0.h[2], w8 ; BE-NEXT: ushll v0.4s, v0.4h, #0 ; BE-NEXT: and v0.16b, v0.16b, v1.16b @@ -263,7 +249,6 @@ define <4 x i32> @volatile_load_v3i8_to_4xi32(ptr %src) { ; CHECK-NEXT: ldr s0, [sp, #12] ; CHECK-NEXT: ldrsb w8, [x0, #2] ; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: mov.h v0[1], v0[1] ; CHECK-NEXT: mov.h v0[2], w8 ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 @@ -281,7 +266,6 @@ define <4 x i32> @volatile_load_v3i8_to_4xi32(ptr %src) { ; BE-NEXT: ldrsb w8, [x0, #2] ; BE-NEXT: rev32 v0.8b, v0.8b ; BE-NEXT: ushll v0.8h, v0.8b, #0 -; BE-NEXT: mov v0.h[1], v0.h[1] ; BE-NEXT: mov v0.h[2], w8 ; BE-NEXT: ushll v0.4s, v0.4h, #0 ; BE-NEXT: and v0.16b, v0.16b, v1.16b @@ -410,12 +394,9 @@ entry: define void @load_ext_to_64bits(ptr %src, ptr %dst) { ; CHECK-LABEL: load_ext_to_64bits: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: ldrb w8, [x0, #2] -; CHECK-NEXT: ldrh w9, [x0] -; CHECK-NEXT: orr w8, w9, w8, lsl #16 -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2 ; CHECK-NEXT: add x8, 
x1, #4 -; CHECK-NEXT: zip1.8b v0, v0, v0 +; CHECK-NEXT: ld1.b { v0 }[2], [x0] ; CHECK-NEXT: bic.4h v0, #255, lsl #8 ; CHECK-NEXT: st1.h { v0 }[2], [x8] ; CHECK-NEXT: str s0, [x1] @@ -507,16 +488,13 @@ entry: define void @load_ext_add_to_64bits(ptr %src, ptr %dst) { ; CHECK-LABEL: load_ext_add_to_64bits: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: ldrb w9, [x0, #2] -; CHECK-NEXT: ldrh w10, [x0] +; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2 ; CHECK-NEXT: Lloh2: ; CHECK-NEXT: adrp x8, lCPI13_0@PAGE ; CHECK-NEXT: Lloh3: ; CHECK-NEXT: ldr d1, [x8, lCPI13_0@PAGEOFF] ; CHECK-NEXT: add x8, x1, #4 -; CHECK-NEXT: orr w9, w10, w9, lsl #16 -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: zip1.8b v0, v0, v0 +; CHECK-NEXT: ld1.b { v0 }[2], [x0] ; CHECK-NEXT: bic.4h v0, #255, lsl #8 ; CHECK-NEXT: add.4h v0, v0, v1 ; CHECK-NEXT: st1.h { v0 }[2], [x8]