[AArch64] Add custom lowering for load <3 x i8>. #78632
Conversation
Add custom combine to lower load <3 x i8> as the more efficient sequence below:

ldrb wX, [x0, #2]
ldrh wY, [x0]
orr wX, wY, wX, lsl #16
fmov s0, wX

At the moment, there are almost no cases in which such vector operations will be generated automatically. The motivating case is non-power-of-2 SLP vectorization: #77790
@llvm/pr-subscribers-backend-aarch64

Author: Florian Hahn (fhahn)

Full diff: https://github.com/llvm/llvm-project/pull/78632.diff

2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8a6f1dc7487bae..e1139c2fede8e4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21095,6 +21095,50 @@ static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
return SDValue();
}
+// A custom combine to lower load <3 x i8> as the more efficient sequence
+// below:
+// ldrb wX, [x0, #2]
+// ldrh wY, [x0]
+// orr wX, wY, wX, lsl #16
+// fmov s0, wX
+//
+static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
+ EVT MemVT = LD->getMemoryVT();
+ if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
+ LD->getOriginalAlign() >= 4)
+ return SDValue();
+
+ SDLoc DL(LD);
+ SDValue Chain = LD->getChain();
+ SDValue BasePtr = LD->getBasePtr();
+
+ // Load 2 x i8, then 1 x i8.
+ SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, LD->getPointerInfo(),
+ LD->getOriginalAlign());
+ SDValue L8 =
+ DAG.getLoad(MVT::i8, DL, Chain,
+ DAG.getMemBasePlusOffset(BasePtr, TypeSize::getFixed(2), DL),
+ LD->getPointerInfo(), LD->getOriginalAlign());
+
+ // Extend to i32.
+ SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
+ SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
+
+ // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
+ SDValue Shr = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
+ DAG.getConstant(16, DL, MVT::i32));
+ SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shr);
+ SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
+
+ // Extract v3i8 again.
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
+ DAG.getConstant(0, DL, MVT::i64));
+ SDValue TokenFactor = DAG.getNode(
+ ISD::TokenFactor, DL, MVT::Other,
+ {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
+ return DAG.getMergeValues({Extract, TokenFactor}, DL);
+}
+
// Perform TBI simplification if supported by the target and try to break up
// nontemporal loads larger than 256-bits loads for odd types so LDNPQ 256-bit
// load instructions can be selected.
@@ -21106,10 +21150,16 @@ static SDValue performLOADCombine(SDNode *N,
performTBISimplification(N->getOperand(1), DCI, DAG);
LoadSDNode *LD = cast<LoadSDNode>(N);
- EVT MemVT = LD->getMemoryVT();
- if (LD->isVolatile() || !LD->isNonTemporal() || !Subtarget->isLittleEndian())
+ if (LD->isVolatile() || !Subtarget->isLittleEndian())
+ return SDValue(N, 0);
+
+ if (SDValue Res = combineV3I8LoadExt(LD, DAG))
+ return Res;
+
+ if (!LD->isNonTemporal())
return SDValue(N, 0);
+ EVT MemVT = LD->getMemoryVT();
if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
MemVT.getSizeInBits() % 256 == 0 ||
256 % MemVT.getScalarSizeInBits() != 0)
diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
index 9eeb194409df6f..7cac4134f0e159 100644
--- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
@@ -5,19 +5,10 @@
define <16 x i8> @load_v3i8(ptr %src, ptr %dst) {
; CHECK-LABEL: load_v3i8:
; CHECK: ; %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: ldr s0, [sp, #12]
-; CHECK-NEXT: ushll.8h v0, v0, #0
-; CHECK-NEXT: umov.h w8, v0[0]
-; CHECK-NEXT: umov.h w9, v0[1]
+; CHECK-NEXT: ldrb w8, [x0, #2]
+; CHECK-NEXT: ldrh w9, [x0]
+; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: add x8, x0, #2
-; CHECK-NEXT: mov.b v0[1], w9
-; CHECK-NEXT: ld1.b { v0 }[2], [x8]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; BE-LABEL: load_v3i8:
@@ -47,19 +38,14 @@ define <16 x i8> @load_v3i8(ptr %src, ptr %dst) {
define <4 x i32> @load_v3i8_to_4xi32(ptr %src, ptr %dst) {
; CHECK-LABEL: load_v3i8_to_4xi32:
; CHECK: ; %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: ldrb w8, [x0, #2]
+; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: ldr s0, [sp, #12]
-; CHECK-NEXT: ldrsb w8, [x0, #2]
-; CHECK-NEXT: ushll.8h v0, v0, #0
-; CHECK-NEXT: mov.h v0[1], v0[1]
-; CHECK-NEXT: mov.h v0[2], w8
+; CHECK-NEXT: orr w8, w9, w8, lsl #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: and.16b v0, v0, v1
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; BE-LABEL: load_v3i8_to_4xi32:
@@ -193,19 +179,15 @@ entry:
define void @load_ext_to_64bits(ptr %src, ptr %dst) {
; CHECK-LABEL: load_ext_to_64bits:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: strh w8, [sp, #12]
-; CHECK-NEXT: add x8, x0, #2
-; CHECK-NEXT: ldr s0, [sp, #12]
-; CHECK-NEXT: ushll.8h v0, v0, #0
-; CHECK-NEXT: ld1.b { v0 }[4], [x8]
+; CHECK-NEXT: ldrb w8, [x0, #2]
+; CHECK-NEXT: ldrh w9, [x0]
+; CHECK-NEXT: orr w8, w9, w8, lsl #16
+; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: add x8, x1, #4
+; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: bic.4h v0, #255, lsl #8
; CHECK-NEXT: st1.h { v0 }[2], [x8]
; CHECK-NEXT: str s0, [x1]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; BE-LABEL: load_ext_to_64bits:
Is there some reason to prefer that sequence over a shorter sequence, like a pair of ld1r followed by a zip1? I mean, I can imagine your sequence is faster on certain CPUs, but I'd want to document the reasoning.

More variations:

Actually, I guess the following is the shortest, at 2 instructions:
Thanks, this is indeed more compact. I tried to massage the SelectionDAG nodes to generate it (7cc78c5) but it appears there are some cases where this results in slightly more code. I can check where those differences are coming from. In terms of overall cycles, both sequences should be mostly equivalent on the CPUs I checked.
LGTM because I just did a similar thing in our (unfortunately, closed-source) backend. Let's wait for the AArch64 code owners :)
It looks like the INSERT_VECTOR_ELT is getting "optimized" into a BUILD_VECTOR, or something like that, instead of doing a shuffle like it does with your original sequence.
Update checks after adding more tests in e7b4ff8
TypeSize Offset2 = TypeSize::getFixed(2);
SDValue L8 = DAG.getLoad(
    MVT::i8, DL, Chain, DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
    LD->getPointerInfo(), commonAlignment(LD->getOriginalAlign(), Offset2));
MachineFunction::getMachineMemOperand has an overload that takes an existing MachineMemOperand and adds an offset; that will produce a more accurate result here.
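A minimal sketch of that suggestion (hypothetical, not the committed code; it reuses Chain, BasePtr, Offset2, and LD from the patch):

// Derive the narrow load's memory operand from the original one so the
// offset and alignment bookkeeping stays accurate.
MachineFunction &MF = DAG.getMachineFunction();
MachineMemOperand *MMO =
    MF.getMachineMemOperand(LD->getMemOperand(), /*Offset=*/2, /*Size=*/1);
SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
                         DAG.getMemBasePlusOffset(BasePtr, Offset2, DL), MMO);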
Thanks, updated!
✅ With the latest revision this PR passed the C/C++ code formatter.
@efriedma-quic ok I managed to track down where the issue is. The only workaround I could come up with is extending
Extra tests for llvm#78637 llvm#78632 (cherry-picked from ff1cde5)
Extra tests for llvm#78637 llvm#78632 (cherry-picked from e7b4ff8)
Not a fan, but if we must then I think there might still be some gaps...
Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Vec.getValueType(), Vec,
                  SDValue(L, 0), DAG.getConstant(2, dl, MVT::i64));
Vec = DAG.getNode(ISD::BITCAST, dl, MVT::v4i16, Vec);
I think Vec could have quite a variety of unexpected types here (though running at a specific phase of DAG might limit that). There's no reason to expect it to have either 4 elements or for each element to be i16 just from what you've checked so far.
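A guard along these lines would make those assumptions explicit (a hypothetical sketch, using the names from the quoted diff; not the committed fix):

// Bail out unless Vec really has the 4 x i16 shape the bitcast assumes.
EVT VecVT = Vec.getValueType();
if (!VecVT.isFixedLengthVector() || VecVT.getVectorNumElements() != 4 ||
    VecVT.getVectorElementType() != MVT::i16)
  return SDValue();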
SDValue V1 = Op.getOperand(1);
SDValue V2 = Op.getOperand(2);
SDValue V3 = Op.getOperand(3);
if (V0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
This is a hyper-specific pattern. I assume it's because we are specifically looking for and only care about a single <3 x i8> instruction (a load?) and this is what it's been mangled to by the time we get to see it. If so we might have to tolerate the horror, but should at least call it out in comments.
Unfortunately yes! I couldn't find any alternative to prevent the folds that create the sub-optimal nodes. I slightly extended the comment at the top of the function. Do you think that's sufficient or should I also add one here?
if (V0.getOperand(0) != V1.getOperand(0) ||
    V0.getConstantOperandVal(1) != 0 || V1.getConstantOperandVal(1) != 1 ||
    !(V3.isUndef() || V3.getConstantOperandVal(1) == 3))
We're not checking V3.getOperand(0) anywhere.
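A minimal sketch of such a check (hypothetical; variable names follow the quoted condition):

// Also require V3, when it is defined, to extract from the same source
// vector as V0 and V1.
if (!V3.isUndef() && (V3.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
                      V3.getOperand(0) != V0.getOperand(0)))
  return SDValue();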
Yep, added a check, thanks!
SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
SDValue Trunc8 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Ext8);
What are these two doing? They ought to amount to a nop.
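For reference, the round-trip folds away entirely (assuming L8 is the i8 load created earlier in the patch):

// zext i8 -> i32 followed by trunc i32 -> i8 reproduces the original value,
// so the pair is equivalent to using L8 directly.
SDValue Trunc8 = L8;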
This reverts commit 667b5c1fa2527c2fe756673ea2dad54eeecc3e82.
This reverts commit 109038b.
Thanks for taking a look @TNorthover! I tried to address the comments, but with them addressed it turned out not to be feasible to go down that path. I changed the codegen back to the slightly longer sequence below (but one using instructions that are usually cheaper/less complex), with a comment about the alternative sequence using
I'm happier with that, I think. Just one typo I spotted in the new version but no need to reupload.
SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);

// Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
SDValue Shr = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
Mismatch between name and operation.
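Presumably the fix is just renaming the value to match the ISD::SHL it creates, e.g. (the committed rename may differ):

// Shift the third byte into bits 16..23 before merging with the low halfword.
SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
                          DAG.getConstant(16, DL, MVT::i32));
SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);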
Thanks, adjusted! Planning to land this once the pre-commit checks pass.
Add custom combine to lower load <3 x i8> as the more efficient sequence below:

ldrb wX, [x0, #2]
ldrh wY, [x0]
orr wX, wY, wX, lsl #16
fmov s0, wX

At the moment, there are almost no cases in which such vector operations will be generated automatically. The motivating case is non-power-of-2 SLP vectorization: llvm#77790

(cherry-picked from d1e162e)
Add custom combine to lower load <3 x i8> as the more efficient sequence below:
ldrb wX, [x0, #2]
ldrh wY, [x0]
orr wX, wY, wX, lsl #16
fmov s0, wX
At the moment, there are almost no cases in which such vector operations will be generated automatically. The motivating case is non-power-of-2 SLP vectorization: #77790