diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 3d1e35dff0a2..e33c5ee784be 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -3592,6 +3592,14 @@ (rule 6 (lower (shuffle a b (u128_from_immediate 0x1716151413121110_0706050403020100))) (x64_punpcklqdq a b)) +;; If the vector shuffle mask is all 0s then that means the first byte of the +;; first operand is broadcast to all bytes. Falling through would load an +;; all-zeros constant from a rip-relative location but it should be slightly +;; more efficient to execute the `pshufb` here-and-now with an xor'd-to-be-zero +;; register. +(rule 6 (lower (shuffle a _ (u128_from_immediate 0))) + (x64_pshufb a (xmm_zero $I8X16))) + ;; Special case for the `shufps` instruction which will select two 32-bit values ;; from the first operand and two 32-bit values from the second operand. Note ;; that there is a second case here as well for when the operands can be diff --git a/cranelift/filetests/filetests/isa/x64/shuffle.clif b/cranelift/filetests/filetests/isa/x64/shuffle.clif index 6de850a16de0..b056d9f1686c 100644 --- a/cranelift/filetests/filetests/isa/x64/shuffle.clif +++ b/cranelift/filetests/filetests/isa/x64/shuffle.clif @@ -616,3 +616,30 @@ block0(v0: i16x8, v1: i16x8): ; popq %rbp ; retq +function %shuffle_all_zeros(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pxor %xmm3, %xmm3, %xmm3 +; pshufb %xmm0, %xmm3, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pxor %xmm3, %xmm3 +; pshufb %xmm3, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/runtests/simd-shuffle.clif b/cranelift/filetests/filetests/runtests/simd-shuffle.clif index 
4d1f45c2f77c..60b515628d36 100644 --- a/cranelift/filetests/filetests/runtests/simd-shuffle.clif +++ b/cranelift/filetests/filetests/runtests/simd-shuffle.clif @@ -251,3 +251,10 @@ block0(v0: i16x8, v1: i16x8): return v5 } ; run: %pshufhw_rhs_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [9 10 11 12 16 14 16 14] + +function %shuffle_all_zeros(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + return v2 +} +; run: %shuffle_all_zeros([5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]) == [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]