From add95d3f8da3d36add548dafe3e9ed8db5626422 Mon Sep 17 00:00:00 2001 From: Trevor Elliott Date: Fri, 5 Aug 2022 10:49:15 -0700 Subject: [PATCH 1/4] Add tests asserting existing behavior for `icmp slt` --- cranelift/filetests/filetests/isa/x64/b1.clif | 32 ++++++++++ .../filetests/filetests/isa/x64/branches.clif | 60 +++++++++++++++++++ 2 files changed, 92 insertions(+) diff --git a/cranelift/filetests/filetests/isa/x64/b1.clif b/cranelift/filetests/filetests/isa/x64/b1.clif index eb971b36fa5e..b6025f48ce31 100644 --- a/cranelift/filetests/filetests/isa/x64/b1.clif +++ b/cranelift/filetests/filetests/isa/x64/b1.clif @@ -73,3 +73,35 @@ block2: ; popq %rbp ; ret +function %f3(i64) -> b1 { +block0(v0: i64): + v1 = iconst.i64 0 + v2 = icmp slt v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cmpq $0, %rdi +; setl %al +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f4(i32) -> b1 { +block0(v0: i32): + v1 = iconst.i32 0 + v2 = icmp slt v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cmpl $0, %edi +; setl %al +; movq %rbp, %rsp +; popq %rbp +; ret + diff --git a/cranelift/filetests/filetests/isa/x64/branches.clif b/cranelift/filetests/filetests/isa/x64/branches.clif index 4b4a587b6b00..ecb880084274 100644 --- a/cranelift/filetests/filetests/isa/x64/branches.clif +++ b/cranelift/filetests/filetests/isa/x64/branches.clif @@ -223,3 +223,63 @@ block2: ; popq %rbp ; ret +function %f6(i64) -> b1 { +block0(v0: i64): + v1 = iconst.i64 0 + v2 = icmp slt v0, v1 + brnz v2, block1 + jump block2 +block1: + v3 = bconst.b1 true + return v3 +block2: + v4 = bconst.b1 false + return v4 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cmpq $0, %rdi +; jl label1; j label2 +; block1: +; movl $1, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; block2: +; xorl %eax, %eax, %eax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f7(i32) -> b1 { +block0(v0: i32): + v1 = iconst.i32 0 + v2 = icmp slt v0, v1 + brnz v2, block1 + jump block2 +block1: 
+ v3 = bconst.b1 true + return v3 +block2: + v4 = bconst.b1 false + return v4 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cmpl $0, %edi +; jl label1; j label2 +; block1: +; movl $1, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; block2: +; xorl %eax, %eax, %eax +; movq %rbp, %rsp +; popq %rbp +; ret + From 856cd9117ce3854d817fd59842570fad701b5836 Mon Sep 17 00:00:00 2001 From: Trevor Elliott Date: Fri, 5 Aug 2022 11:21:59 -0700 Subject: [PATCH 2/4] Add a peephole optimization for `x < 0` --- cranelift/codegen/src/isa/x64/lower.isle | 8 ++++++++ cranelift/filetests/filetests/isa/x64/b1.clif | 8 ++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index a11fa45dd379..07bf67b47dea 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -1501,6 +1501,14 @@ (rule (lower (icmp cc a @ (value_type $I128) b)) (lower_icmp_bool (emit_cmp cc a b))) +;; Peephole optimization for `x < 0`, when x is a signed 64 bit value +(rule (lower (has_type $B1 (icmp (IntCC.SignedLessThan) a @ (value_type $I64) (u64_from_iconst 0)))) + (x64_shr $I64 a (Imm8Reg.Imm8 63))) + +;; Peephole optimization for `x < 0`, when x is a signed 32 bit value +(rule (lower (has_type $B1 (icmp (IntCC.SignedLessThan) a @ (value_type $I32) (u64_from_iconst 0)))) + (x64_shr $I32 a (Imm8Reg.Imm8 31))) + ;; For XMM-held values, we lower to `PCMP*` instructions, sometimes more than ;; one. 
To note: what is different here about the output values is that each ;; lane will be filled with all 1s or all 0s according to the comparison, diff --git a/cranelift/filetests/filetests/isa/x64/b1.clif b/cranelift/filetests/filetests/isa/x64/b1.clif index b6025f48ce31..f302eb431999 100644 --- a/cranelift/filetests/filetests/isa/x64/b1.clif +++ b/cranelift/filetests/filetests/isa/x64/b1.clif @@ -83,8 +83,8 @@ block0(v0: i64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; cmpq $0, %rdi -; setl %al +; shrq $63, %rdi, %rdi +; movq %rdi, %rax ; movq %rbp, %rsp ; popq %rbp ; ret @@ -99,8 +99,8 @@ block0(v0: i32): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; cmpl $0, %edi -; setl %al +; shrl $31, %edi, %edi +; movq %rdi, %rax ; movq %rbp, %rsp ; popq %rbp ; ret From 81998a81298331b453dfe16bdbc5a76c59203e52 Mon Sep 17 00:00:00 2001 From: Trevor Elliott Date: Fri, 5 Aug 2022 14:52:13 -0700 Subject: [PATCH 3/4] Add additional rules and more tests --- cranelift/codegen/src/isa/x64/lower.isle | 32 +++++- cranelift/filetests/filetests/isa/x64/b1.clif | 107 +++++++++++++++++- 2 files changed, 131 insertions(+), 8 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 07bf67b47dea..498b69b7ec90 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -1502,12 +1502,36 @@ (lower_icmp_bool (emit_cmp cc a b))) ;; Peephole optimization for `x < 0`, when x is a signed 64 bit value -(rule (lower (has_type $B1 (icmp (IntCC.SignedLessThan) a @ (value_type $I64) (u64_from_iconst 0)))) - (x64_shr $I64 a (Imm8Reg.Imm8 63))) +(rule (lower (icmp (IntCC.SignedLessThan) x @ (value_type $I64) (u64_from_iconst 0))) + (x64_sar $I64 x (Imm8Reg.Imm8 63))) + +;; Peephole optimization for `0 > x`, when x is a signed 64 bit value +(rule (lower (icmp (IntCC.SignedGreaterThan) (u64_from_iconst 0) x @ (value_type $I64))) + (x64_sar $I64 x (Imm8Reg.Imm8 63))) + +;; Peephole optimization for `0 <= x`, 
when x is a signed 64 bit value +(rule (lower (icmp (IntCC.SignedLessThanOrEqual) (u64_from_iconst 0) x @ (value_type $I64))) + (x64_not $I64 (x64_sar $I64 x (Imm8Reg.Imm8 63)))) + +;; Peephole optimization for `x >= 0`, when x is a signed 64 bit value +(rule (lower (icmp (IntCC.SignedGreaterThanOrEqual) x @ (value_type $I64) (u64_from_iconst 0))) + (x64_not $I64 (x64_sar $I64 x (Imm8Reg.Imm8 63)))) ;; Peephole optimization for `x < 0`, when x is a signed 32 bit value -(rule (lower (has_type $B1 (icmp (IntCC.SignedLessThan) a @ (value_type $I32) (u64_from_iconst 0)))) - (x64_shr $I32 a (Imm8Reg.Imm8 31))) +(rule (lower (icmp (IntCC.SignedLessThan) x @ (value_type $I32) (u64_from_iconst 0))) + (x64_sar $I32 x (Imm8Reg.Imm8 31))) + +;; Peephole optimization for `0 > x`, when x is a signed 32 bit value +(rule (lower (icmp (IntCC.SignedGreaterThan) (u64_from_iconst 0) x @ (value_type $I32))) + (x64_sar $I32 x (Imm8Reg.Imm8 31))) + +;; Peephole optimization for `0 <= x`, when x is a signed 32 bit value +(rule (lower (icmp (IntCC.SignedLessThanOrEqual) (u64_from_iconst 0) x @ (value_type $I32))) + (x64_not $I32 (x64_sar $I32 x (Imm8Reg.Imm8 31)))) + +;; Peephole optimization for `x >= 0`, when x is a signed 32 bit value +(rule (lower (icmp (IntCC.SignedGreaterThanOrEqual) x @ (value_type $I32) (u64_from_iconst 0))) + (x64_not $I32 (x64_sar $I32 x (Imm8Reg.Imm8 31)))) ;; For XMM-held values, we lower to `PCMP*` instructions, sometimes more than ;; one. 
To note: what is different here about the output values is that each diff --git a/cranelift/filetests/filetests/isa/x64/b1.clif b/cranelift/filetests/filetests/isa/x64/b1.clif index f302eb431999..481db0c9f7d2 100644 --- a/cranelift/filetests/filetests/isa/x64/b1.clif +++ b/cranelift/filetests/filetests/isa/x64/b1.clif @@ -73,7 +73,7 @@ block2: ; popq %rbp ; ret -function %f3(i64) -> b1 { +function %test_x_slt_0_i64(i64) -> b1 { block0(v0: i64): v1 = iconst.i64 0 v2 = icmp slt v0, v1 @@ -83,13 +83,13 @@ block0(v0: i64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; shrq $63, %rdi, %rdi +; sarq $63, %rdi, %rdi ; movq %rdi, %rax ; movq %rbp, %rsp ; popq %rbp ; ret -function %f4(i32) -> b1 { +function %test_x_slt_0_i32(i32) -> b1 { block0(v0: i32): v1 = iconst.i32 0 v2 = icmp slt v0, v1 @@ -99,9 +99,108 @@ block0(v0: i32): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; shrl $31, %edi, %edi +; sarl $31, %edi, %edi ; movq %rdi, %rax ; movq %rbp, %rsp ; popq %rbp ; ret +function %test_0_sgt_x_i64(i64) -> b1 { +block0(v0: i64): + v1 = iconst.i64 0 + v2 = icmp sgt v1, v0 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; sarq $63, %rdi, %rdi +; movq %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %test_0_sgt_x_i32(i32) -> b1 { +block0(v0: i32): + v1 = iconst.i32 0 + v2 = icmp sgt v1, v0 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; sarl $31, %edi, %edi +; movq %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %test_0_sle_x_i64(i64) -> b1 { +block0(v0: i64): + v1 = iconst.i64 0 + v2 = icmp sle v1, v0 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; sarq $63, %rdi, %rdi +; notq %rdi, %rdi +; movq %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %test_0_sle_x_i32(i32) -> b1 { +block0(v0: i32): + v1 = iconst.i32 0 + v2 = icmp sle v1, v0 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; sarl $31, %edi, %edi +; notl %edi, %edi +; movq %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; 
ret + +function %test_x_sge_0_i64(i64) -> b1 { +block0(v0: i64): + v1 = iconst.i64 0 + v2 = icmp sge v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; sarq $63, %rdi, %rdi +; notq %rdi, %rdi +; movq %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %test_x_sge_0_i32(i32) -> b1 { +block0(v0: i32): + v1 = iconst.i32 0 + v2 = icmp sge v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; sarl $31, %edi, %edi +; notl %edi, %edi +; movq %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret From 74c5ebe14c7a0dfc71ab6dfb683d3ecae46bf541 Mon Sep 17 00:00:00 2001 From: Trevor Elliott Date: Mon, 8 Aug 2022 10:47:53 -0700 Subject: [PATCH 4/4] Restrict this optimization to B1 and revert back to shr --- cranelift/codegen/src/isa/x64/lower.isle | 32 +++++++++---------- cranelift/filetests/filetests/isa/x64/b1.clif | 21 ++++++------ 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 498b69b7ec90..5188dd322af4 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -1502,36 +1502,36 @@ (lower_icmp_bool (emit_cmp cc a b))) ;; Peephole optimization for `x < 0`, when x is a signed 64 bit value -(rule (lower (icmp (IntCC.SignedLessThan) x @ (value_type $I64) (u64_from_iconst 0))) - (x64_sar $I64 x (Imm8Reg.Imm8 63))) +(rule (lower (has_type $B1 (icmp (IntCC.SignedLessThan) x @ (value_type $I64) (u64_from_iconst 0)))) + (x64_shr $I64 x (Imm8Reg.Imm8 63))) ;; Peephole optimization for `0 > x`, when x is a signed 64 bit value -(rule (lower (icmp (IntCC.SignedGreaterThan) (u64_from_iconst 0) x @ (value_type $I64))) - (x64_sar $I64 x (Imm8Reg.Imm8 63))) +(rule (lower (has_type $B1 (icmp (IntCC.SignedGreaterThan) (u64_from_iconst 0) x @ (value_type $I64)))) + (x64_shr $I64 x (Imm8Reg.Imm8 63))) ;; Peephole optimization for `0 <= x`, when x is a signed 64 bit value -(rule (lower (icmp 
(IntCC.SignedLessThanOrEqual) (u64_from_iconst 0) x @ (value_type $I64))) - (x64_not $I64 (x64_sar $I64 x (Imm8Reg.Imm8 63)))) +(rule (lower (has_type $B1 (icmp (IntCC.SignedLessThanOrEqual) (u64_from_iconst 0) x @ (value_type $I64)))) + (x64_shr $I64 (x64_not $I64 x) (Imm8Reg.Imm8 63))) ;; Peephole optimization for `x >= 0`, when x is a signed 64 bit value -(rule (lower (icmp (IntCC.SignedGreaterThanOrEqual) x @ (value_type $I64) (u64_from_iconst 0))) - (x64_not $I64 (x64_sar $I64 x (Imm8Reg.Imm8 63)))) +(rule (lower (has_type $B1 (icmp (IntCC.SignedGreaterThanOrEqual) x @ (value_type $I64) (u64_from_iconst 0)))) + (x64_shr $I64 (x64_not $I64 x) (Imm8Reg.Imm8 63))) ;; Peephole optimization for `x < 0`, when x is a signed 32 bit value -(rule (lower (icmp (IntCC.SignedLessThan) x @ (value_type $I32) (u64_from_iconst 0))) - (x64_sar $I32 x (Imm8Reg.Imm8 31))) +(rule (lower (has_type $B1 (icmp (IntCC.SignedLessThan) x @ (value_type $I32) (u64_from_iconst 0)))) + (x64_shr $I32 x (Imm8Reg.Imm8 31))) ;; Peephole optimization for `0 > x`, when x is a signed 32 bit value -(rule (lower (icmp (IntCC.SignedGreaterThan) (u64_from_iconst 0) x @ (value_type $I32))) - (x64_sar $I32 x (Imm8Reg.Imm8 31))) +(rule (lower (has_type $B1 (icmp (IntCC.SignedGreaterThan) (u64_from_iconst 0) x @ (value_type $I32)))) + (x64_shr $I32 x (Imm8Reg.Imm8 31))) ;; Peephole optimization for `0 <= x`, when x is a signed 32 bit value -(rule (lower (icmp (IntCC.SignedLessThanOrEqual) (u64_from_iconst 0) x @ (value_type $I32))) - (x64_not $I32 (x64_sar $I32 x (Imm8Reg.Imm8 31)))) +(rule (lower (has_type $B1 (icmp (IntCC.SignedLessThanOrEqual) (u64_from_iconst 0) x @ (value_type $I32)))) + (x64_shr $I32 (x64_not $I64 x) (Imm8Reg.Imm8 31))) ;; Peephole optimization for `x >= 0`, when x is a signed 32 bit value -(rule (lower (icmp (IntCC.SignedGreaterThanOrEqual) x @ (value_type $I32) (u64_from_iconst 0))) - (x64_not $I32 (x64_sar $I32 x (Imm8Reg.Imm8 31)))) +(rule (lower (has_type $B1 (icmp 
(IntCC.SignedGreaterThanOrEqual) x @ (value_type $I32) (u64_from_iconst 0)))) + (x64_shr $I32 (x64_not $I64 x) (Imm8Reg.Imm8 31))) ;; For XMM-held values, we lower to `PCMP*` instructions, sometimes more than ;; one. To note: what is different here about the output values is that each diff --git a/cranelift/filetests/filetests/isa/x64/b1.clif b/cranelift/filetests/filetests/isa/x64/b1.clif index 481db0c9f7d2..a67242437054 100644 --- a/cranelift/filetests/filetests/isa/x64/b1.clif +++ b/cranelift/filetests/filetests/isa/x64/b1.clif @@ -83,7 +83,7 @@ block0(v0: i64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; sarq $63, %rdi, %rdi +; shrq $63, %rdi, %rdi ; movq %rdi, %rax ; movq %rbp, %rsp ; popq %rbp @@ -99,7 +99,7 @@ block0(v0: i32): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; sarl $31, %edi, %edi +; shrl $31, %edi, %edi ; movq %rdi, %rax ; movq %rbp, %rsp ; popq %rbp @@ -115,7 +115,7 @@ block0(v0: i64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; sarq $63, %rdi, %rdi +; shrq $63, %rdi, %rdi ; movq %rdi, %rax ; movq %rbp, %rsp ; popq %rbp @@ -131,7 +131,7 @@ block0(v0: i32): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; sarl $31, %edi, %edi +; shrl $31, %edi, %edi ; movq %rdi, %rax ; movq %rbp, %rsp ; popq %rbp @@ -147,8 +147,8 @@ block0(v0: i64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; sarq $63, %rdi, %rdi ; notq %rdi, %rdi +; shrq $63, %rdi, %rdi ; movq %rdi, %rax ; movq %rbp, %rsp ; popq %rbp @@ -164,8 +164,8 @@ block0(v0: i32): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; sarl $31, %edi, %edi -; notl %edi, %edi +; notq %rdi, %rdi +; shrl $31, %edi, %edi ; movq %rdi, %rax ; movq %rbp, %rsp ; popq %rbp @@ -181,8 +181,8 @@ block0(v0: i64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; sarq $63, %rdi, %rdi ; notq %rdi, %rdi +; shrq $63, %rdi, %rdi ; movq %rdi, %rax ; movq %rbp, %rsp ; popq %rbp @@ -198,9 +198,10 @@ block0(v0: i32): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; sarl $31, %edi, %edi -; notl %edi, %edi +; notq %rdi, %rdi +; shrl $31, %edi, %edi ; movq 
%rdi, %rax ; movq %rbp, %rsp ; popq %rbp ; ret +