[AArch64] - Fold `and` and `cmp` into `tst` #110347
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Jorge Botto (jf-botto)

Changes

Fixes #102703. https://godbolt.org/z/nfj8xsb1Y

The pattern `%2 = and i32 %0, 254; %3 = icmp eq i32 %2, 0` is optimised by instcombine into `%3 = icmp ult i32 %0, 2`. However, post instcombine this leads to worse AArch64 code than the unoptimised version: pre instcombine the function compiles to `tst w0, #0xfe; cset w0, eq; ret`, while post instcombine it compiles to `and w8, w0, #0xff; cmp w8, #2; cset w0, lo; ret`.

In the unoptimised version, SelectionDAG converts `SETCC (AND X 254) 0 EQ` into `CSEL 0 1 1 (ANDS X 254)`, which gets emitted as a `tst`. In the optimised version, SelectionDAG converts `SETCC (AND X 255) 2 ULT` into `CSEL 0 1 2 (SUBS (AND X 255) 2)`, which gets emitted as an `and`/`cmp`.

This PR adds an optimisation to `AArch64ISelLowering`, converting `SETCC (AND X Y) Z ULT` into `SETCC (AND X (Y & ~(Z - 1))) 0 EQ` when `Z` is a power of two, so that SelectionDAG/codegen produce the same optimised code for both examples.

Full diff: https://github.com/llvm/llvm-project/pull/110347.diff

3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4166d9bd22bc01..3f0282e8689dc4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4299,6 +4299,36 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
Op.getOperand(1));
}
+// Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1))) 0 EQ when Z
+// is a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)), which
+// EmitComparison can emit as a single tst.
+static void SimplifySetCCIntoEq(ISD::CondCode &CC, SDValue &LHS, SDValue &RHS,
+ SelectionDAG &DAG, const SDLoc DL) {
+ switch (CC) {
+ default:
+ break;
+ case ISD::SETULT:
+ if (LHS.getOpcode() == ISD::AND) {
+ ConstantSDNode *LHSAndConst = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
+ ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
+ if (LHSAndConst && RHSConst && LHSAndConst->hasOneUse() &&
+ RHSConst->hasOneUse()) {
+ uint64_t LHSAndConstValue = LHSAndConst->getZExtValue();
+ uint64_t RHSConstValue = RHSConst->getZExtValue();
+ if (isPowerOf2_64(RHSConstValue)) {
+ uint64_t NewMaskValue = LHSAndConstValue & ~(RHSConstValue - 1);
+ LHS = DAG.getNode(
+ ISD::AND, DL, LHS.getValueType(), LHS.getOperand(0),
+ DAG.getConstant(NewMaskValue, DL, LHS.getValueType()));
+ RHS = DAG.getConstant(0, DL, RHS.getValueType());
+ CC = ISD::SETEQ;
+ }
+ }
+ }
+ break;
+ }
+}
+
SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
@@ -10587,6 +10617,9 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
}
if (LHS.getValueType().isInteger()) {
+
+ SimplifySetCCIntoEq(CC, LHS, RHS, DAG, dl);
+
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(
LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
diff --git a/llvm/test/CodeGen/AArch64/pr102703.ll b/llvm/test/CodeGen/AArch64/pr102703.ll
new file mode 100644
index 00000000000000..dcf2d65ce27368
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/pr102703.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -O1 -mtriple=aarch64 | FileCheck %s
+
+define i1 @lt2_u8(i8 %0) {
+; CHECK-LABEL: lt2_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: tst w0, #0xfe
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %2 = icmp ult i8 %0, 2
+ ret i1 %2
+}
+
+define i1 @lt4_u8(i8 %0) {
+; CHECK-LABEL: lt4_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: tst w0, #0xfc
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %2 = icmp ult i8 %0, 4
+ ret i1 %2
+}
+
+define i1 @lt8_u8(i8 %0) {
+; CHECK-LABEL: lt8_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: tst w0, #0xf8
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %2 = icmp ult i8 %0, 8
+ ret i1 %2
+}
+
+define i1 @lt16_u8(i8 %0) {
+; CHECK-LABEL: lt16_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: tst w0, #0xf0
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %2 = icmp ult i8 %0, 16
+ ret i1 %2
+}
+
+define i1 @lt32_u8(i8 %0) {
+; CHECK-LABEL: lt32_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: tst w0, #0xe0
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %2 = icmp ult i8 %0, 32
+ ret i1 %2
+}
+
+define i1 @lt64_u8(i8 %0) {
+; CHECK-LABEL: lt64_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: tst w0, #0xc0
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %2 = icmp ult i8 %0, 64
+ ret i1 %2
+}
+
+; negative test
+define i1 @lt3_u8(i8 %0) {
+; CHECK-LABEL: lt3_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w0, #0xff
+; CHECK-NEXT: cmp w8, #3
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: ret
+ %2 = icmp ult i8 %0, 3
+ ret i1 %2
+}
diff --git a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
index bb4df6d8935b1b..7c80f9320faec1 100644
--- a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
+++ b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
@@ -287,9 +287,8 @@ define i1 @add_ultcmp_bad_i16_i8_add(i16 %x, i16 %y) nounwind {
; CHECK-LABEL: add_ultcmp_bad_i16_i8_add:
; CHECK: // %bb.0:
; CHECK-NEXT: add w8, w0, w1
-; CHECK-NEXT: and w8, w8, #0xffff
-; CHECK-NEXT: cmp w8, #256
-; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: tst w8, #0xff00
+; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%tmp0 = add i16 %x, %y
%tmp1 = icmp ult i16 %tmp0, 256 ; 1U << 8
@@ -328,9 +327,8 @@ define i1 @add_ultcmp_bad_i16_i8_c0notpoweroftwo(i16 %x) nounwind {
; CHECK-LABEL: add_ultcmp_bad_i16_i8_c0notpoweroftwo:
; CHECK: // %bb.0:
; CHECK-NEXT: add w8, w0, #192
-; CHECK-NEXT: and w8, w8, #0xffff
-; CHECK-NEXT: cmp w8, #256
-; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: tst w8, #0xff00
+; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%tmp0 = add i16 %x, 192 ; (1U << (8-1)) + (1U << (8-1-1))
%tmp1 = icmp ult i16 %tmp0, 256 ; 1U << 8
@@ -356,9 +354,8 @@ define i1 @add_ultcmp_bad_i16_i8_magic(i16 %x) nounwind {
; CHECK-LABEL: add_ultcmp_bad_i16_i8_magic:
; CHECK: // %bb.0:
; CHECK-NEXT: add w8, w0, #64
-; CHECK-NEXT: and w8, w8, #0xffff
-; CHECK-NEXT: cmp w8, #256
-; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: tst w8, #0xff00
+; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%tmp0 = add i16 %x, 64 ; 1U << (8-1-1)
%tmp1 = icmp ult i16 %tmp0, 256 ; 1U << 8
@@ -370,9 +367,8 @@ define i1 @add_ultcmp_bad_i16_i4(i16 %x) nounwind {
; CHECK-LABEL: add_ultcmp_bad_i16_i4:
; CHECK: // %bb.0:
; CHECK-NEXT: add w8, w0, #8
-; CHECK-NEXT: and w8, w8, #0xffff
-; CHECK-NEXT: cmp w8, #16
-; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: tst w8, #0xfff0
+; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%tmp0 = add i16 %x, 8 ; 1U << (4-1)
%tmp1 = icmp ult i16 %tmp0, 16 ; 1U << 4
@@ -384,9 +380,8 @@ define i1 @add_ultcmp_bad_i24_i8(i24 %x) nounwind {
; CHECK-LABEL: add_ultcmp_bad_i24_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: add w8, w0, #128
-; CHECK-NEXT: and w8, w8, #0xffffff
-; CHECK-NEXT: cmp w8, #256
-; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: tst w8, #0xffff00
+; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%tmp0 = add i24 %x, 128 ; 1U << (8-1)
%tmp1 = icmp ult i24 %tmp0, 256 ; 1U << 8
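A side note on the updated CHECK lines: the new `tst` immediates are exactly the `NewMaskValue = Y & ~(Z - 1)` computed by the patch, where `Y` is (here, presumably) the mask introduced by type promotion (0xff for i8, 0xffff for i16, 0xffffff for i24) and `Z` is the compared constant. A small standalone C++ sketch (illustrative only, not LLVM code) that reproduces those immediates:

```cpp
// Reproduce the tst immediates expected by the updated tests:
// NewMask = Y & ~(Z - 1), with Z a power of two.
#include <cstdint>
#include <cstdio>

static uint64_t newMask(uint64_t Y, uint64_t Z) { return Y & ~(Z - 1); }

int main() {
  std::printf("%#llx\n", (unsigned long long)newMask(0xff, 2));       // 0xfe     (lt2_u8)
  std::printf("%#llx\n", (unsigned long long)newMask(0xff, 64));      // 0xc0     (lt64_u8)
  std::printf("%#llx\n", (unsigned long long)newMask(0xffff, 256));   // 0xff00   (i16, ult 256)
  std::printf("%#llx\n", (unsigned long long)newMask(0xffff, 16));    // 0xfff0   (i16, ult 16)
  std::printf("%#llx\n", (unsigned long long)newMask(0xffffff, 256)); // 0xffff00 (i24, ult 256)
  return 0;
}
```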
Thanks. LGTM
ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
if (LHSConstOp && RHSConst) {
  uint64_t lhsConstValue = LHSConstOp->getZExtValue();
LLVM uses capitalized variable names, so maybe use LHSConstValue and RHSConstValue. Same for NewMaskValue below.
Apologies, fixed it.
Thank you for the review and your comments @davemgreen, I really appreciate it as this is my first AArch64 PR.
Fixes #102703. https://godbolt.org/z/nfj8xsb1Y

The following pattern:

```
%2 = and i32 %0, 254
%3 = icmp eq i32 %2, 0
```

is optimised by instcombine into:

```
%3 = icmp ult i32 %0, 2
```

However, post instcombine this leads to worse AArch64 code than the unoptimised version.

Pre instcombine:

```
tst w0, #0xfe
cset w0, eq
ret
```

Post instcombine:

```
and w8, w0, #0xff
cmp w8, #2
cset w0, lo
ret
```

In the unoptimised version, SelectionDAG converts `SETCC (AND X 254) 0 EQ` into `CSEL 0 1 1 (ANDS X 254)`, which gets emitted as a `tst`. In the optimised version, SelectionDAG converts `SETCC (AND X 255) 2 ULT` into `CSEL 0 1 2 (SUBS (AND X 255) 2)`, which gets emitted as an `and`/`cmp`.

This PR adds an optimisation to `AArch64ISelLowering`, converting `SETCC (AND X Y) Z ULT` into `SETCC (AND X (Y & ~(Z - 1))) 0 EQ` when `Z` is a power of two. This makes SelectionDAG/Codegen produce the same optimised code for both examples.
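The correctness of this rewrite rests on a simple identity: when `Z` is a power of two, `(X & Y) < Z` holds exactly when every bit of `X & Y` at or above `log2(Z)` is clear, i.e. when `X & (Y & ~(Z - 1))` is zero. A minimal standalone C++ sketch (illustrative only, not part of the patch) that brute-forces the identity for 8-bit values:

```cpp
// Brute-force check of: (X & Y) < Z  <=>  (X & (Y & ~(Z - 1))) == 0,
// for Z a power of two. Uses Y = 0xff (the i8 promotion mask from the tests).
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Y = 0xff;
  for (uint64_t Z = 1; Z <= 128; Z <<= 1) { // powers of two
    const uint64_t NewMask = Y & ~(Z - 1);  // bits that must all be clear
    for (uint64_t X = 0; X < 256; ++X)
      assert(((X & Y) < Z) == ((X & NewMask) == 0));
  }
  return 0;
}
```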