Skip to content

Commit

Permalink
[VectorCombine] foldBitcastShuf - add support for length changing shuffles
Browse files Browse the repository at this point in the history

Allow length-changing shuffle masks in the "bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'" fold.

It also exposes some poor shuffle mask detection for extract/insert subvector cases inside improveShuffleKindFromMask.

First stage towards addressing Issue llvm#67803.
  • Loading branch information
RKSimon committed Oct 6, 2023
1 parent 3bae69e commit 94795a3
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 31 deletions.
18 changes: 13 additions & 5 deletions llvm/lib/Transforms/Vectorize/VectorCombine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -689,15 +689,18 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) {
// 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
// scalable type is unknown; Second, we cannot reason if the narrowed shuffle
// mask for scalable type is a splat or not.
// 2) Disallow non-vector casts and length-changing shuffles.
// 2) Disallow non-vector casts.
// TODO: We could allow any shuffle.
auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
auto *SrcTy = dyn_cast<FixedVectorType>(V->getType());
if (!SrcTy || I.getOperand(0)->getType() != SrcTy)
if (!DestTy || !SrcTy)
return false;

auto *DestTy = cast<FixedVectorType>(I.getType());
unsigned DestEltSize = DestTy->getScalarSizeInBits();
unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
return false;

SmallVector<int, 16> NewMask;
if (DestEltSize <= SrcEltSize) {
// The bitcast is from wide to narrow/equal elements. The shuffle mask can
Expand All @@ -714,18 +717,23 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) {
return false;
}

// Bitcast the shuffle src - keep its original width but using the destination
// scalar type.
unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
auto *ShuffleTy = FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);

// The new shuffle must not cost more than the old shuffle. The bitcast is
// moved ahead of the shuffle, so assume that it has the same cost as before.
InstructionCost DestCost = TTI.getShuffleCost(
TargetTransformInfo::SK_PermuteSingleSrc, DestTy, NewMask);
TargetTransformInfo::SK_PermuteSingleSrc, ShuffleTy, NewMask);
InstructionCost SrcCost =
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy, Mask);
if (DestCost > SrcCost || !DestCost.isValid())
return false;

// bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
++NumShufOfBitcast;
Value *CastV = Builder.CreateBitCast(V, DestTy);
Value *CastV = Builder.CreateBitCast(V, ShuffleTy);
Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask);
replaceValue(I, *Shuf);
return true;
Expand Down
41 changes: 28 additions & 13 deletions llvm/test/Transforms/VectorCombine/X86/shuffle-inseltpoison.ll
Original file line number Diff line number Diff line change
Expand Up @@ -33,35 +33,50 @@ define <4 x float> @bitcast_shuf_same_size(<4 x i32> %v) {
ret <4 x float> %r
}

; TODO - length-changing shuffle
; Length-changing shuffles

; Swaps the two i32 lanes of %v and repeats the pair (mask <1, 0, 1, 0>) to
; build a <4 x i32>, then reinterprets the result as <16 x i8>. The expectation
; lines below cover two outcomes: the shuffle left ahead of the bitcast, or the
; bitcast applied to %v first and followed by an equivalent byte-granular shuffle.
define <16 x i8> @bitcast_shuf_narrow_element_subvector(<2 x i32> %v) {
; CHECK-LABEL: @bitcast_shuf_narrow_element_subvector(
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
; CHECK-NEXT: ret <16 x i8> [[R]]
; SSE-LABEL: @bitcast_shuf_narrow_element_subvector(
; SSE-NEXT: [[SHUF:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
; SSE-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
; SSE-NEXT: ret <16 x i8> [[R]]
;
; AVX-LABEL: @bitcast_shuf_narrow_element_subvector(
; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
; AVX-NEXT: [[R:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
; AVX-NEXT: ret <16 x i8> [[R]]
;
%shuf = shufflevector <2 x i32> %v, <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
%r = bitcast <4 x i32> %shuf to <16 x i8>
ret <16 x i8> %r
}

; Concatenates %v with itself (mask <0, 1, 0, 1>) to build a <4 x i64>, then
; reinterprets the result as <16 x i16>. The expectation lines below cover two
; outcomes: the shuffle left ahead of the bitcast, or the bitcast applied to %v
; first and followed by an equivalent i16-granular shuffle.
define <16 x i16> @bitcast_shuf_narrow_element_concat_subvectors(<2 x i64> %v) {
; CHECK-LABEL: @bitcast_shuf_narrow_element_concat_subvectors(
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i64> [[V:%.*]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i64> [[SHUF]] to <16 x i16>
; CHECK-NEXT: ret <16 x i16> [[R]]
; SSE-LABEL: @bitcast_shuf_narrow_element_concat_subvectors(
; SSE-NEXT: [[SHUF:%.*]] = shufflevector <2 x i64> [[V:%.*]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; SSE-NEXT: [[R:%.*]] = bitcast <4 x i64> [[SHUF]] to <16 x i16>
; SSE-NEXT: ret <16 x i16> [[R]]
;
; AVX-LABEL: @bitcast_shuf_narrow_element_concat_subvectors(
; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[V:%.*]] to <8 x i16>
; AVX-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; AVX-NEXT: ret <16 x i16> [[R]]
;
%shuf = shufflevector <2 x i64> %v, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
%r = bitcast <4 x i64> %shuf to <16 x i16>
ret <16 x i16> %r
}

define <16 x i8> @bitcast_shuf_extract_subvector(<8 x i32> %v) {
; CHECK-LABEL: @bitcast_shuf_extract_subvector(
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <8 x i32> [[V:%.*]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
; CHECK-NEXT: ret <16 x i8> [[R]]
; SSE-LABEL: @bitcast_shuf_extract_subvector(
; SSE-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[V:%.*]] to <32 x i8>
; SSE-NEXT: [[R:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; SSE-NEXT: ret <16 x i8> [[R]]
;
; AVX-LABEL: @bitcast_shuf_extract_subvector(
; AVX-NEXT: [[SHUF:%.*]] = shufflevector <8 x i32> [[V:%.*]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; AVX-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
; AVX-NEXT: ret <16 x i8> [[R]]
;
%shuf = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = bitcast <4 x i32> %shuf to <16 x i8>
Expand Down
41 changes: 28 additions & 13 deletions llvm/test/Transforms/VectorCombine/X86/shuffle.ll
Original file line number Diff line number Diff line change
Expand Up @@ -33,35 +33,50 @@ define <4 x float> @bitcast_shuf_same_size(<4 x i32> %v) {
ret <4 x float> %r
}

; TODO - Length-changing shuffle
; Length-changing shuffles

; Swaps the two i32 lanes of %v and repeats the pair (mask <1, 0, 1, 0>) to
; build a <4 x i32>, then reinterprets the result as <16 x i8>. The expectation
; lines below cover two outcomes: the shuffle left ahead of the bitcast, or the
; bitcast applied to %v first and followed by an equivalent byte-granular shuffle.
define <16 x i8> @bitcast_shuf_narrow_element_subvector(<2 x i32> %v) {
; CHECK-LABEL: @bitcast_shuf_narrow_element_subvector(
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
; CHECK-NEXT: ret <16 x i8> [[R]]
; SSE-LABEL: @bitcast_shuf_narrow_element_subvector(
; SSE-NEXT: [[SHUF:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
; SSE-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
; SSE-NEXT: ret <16 x i8> [[R]]
;
; AVX-LABEL: @bitcast_shuf_narrow_element_subvector(
; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
; AVX-NEXT: [[R:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
; AVX-NEXT: ret <16 x i8> [[R]]
;
%shuf = shufflevector <2 x i32> %v, <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
%r = bitcast <4 x i32> %shuf to <16 x i8>
ret <16 x i8> %r
}

; Concatenates %v with itself (mask <0, 1, 0, 1>) to build a <4 x i64>, then
; reinterprets the result as <16 x i16>. The expectation lines below cover two
; outcomes: the shuffle left ahead of the bitcast, or the bitcast applied to %v
; first and followed by an equivalent i16-granular shuffle.
define <16 x i16> @bitcast_shuf_narrow_element_concat_subvectors(<2 x i64> %v) {
; CHECK-LABEL: @bitcast_shuf_narrow_element_concat_subvectors(
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i64> [[V:%.*]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i64> [[SHUF]] to <16 x i16>
; CHECK-NEXT: ret <16 x i16> [[R]]
; SSE-LABEL: @bitcast_shuf_narrow_element_concat_subvectors(
; SSE-NEXT: [[SHUF:%.*]] = shufflevector <2 x i64> [[V:%.*]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; SSE-NEXT: [[R:%.*]] = bitcast <4 x i64> [[SHUF]] to <16 x i16>
; SSE-NEXT: ret <16 x i16> [[R]]
;
; AVX-LABEL: @bitcast_shuf_narrow_element_concat_subvectors(
; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[V:%.*]] to <8 x i16>
; AVX-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; AVX-NEXT: ret <16 x i16> [[R]]
;
%shuf = shufflevector <2 x i64> %v, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
%r = bitcast <4 x i64> %shuf to <16 x i16>
ret <16 x i16> %r
}

define <16 x i8> @bitcast_shuf_extract_subvector(<8 x i32> %v) {
; CHECK-LABEL: @bitcast_shuf_extract_subvector(
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <8 x i32> [[V:%.*]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
; CHECK-NEXT: ret <16 x i8> [[R]]
; SSE-LABEL: @bitcast_shuf_extract_subvector(
; SSE-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[V:%.*]] to <32 x i8>
; SSE-NEXT: [[R:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; SSE-NEXT: ret <16 x i8> [[R]]
;
; AVX-LABEL: @bitcast_shuf_extract_subvector(
; AVX-NEXT: [[SHUF:%.*]] = shufflevector <8 x i32> [[V:%.*]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; AVX-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
; AVX-NEXT: ret <16 x i8> [[R]]
;
%shuf = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = bitcast <4 x i32> %shuf to <16 x i8>
Expand Down

0 comments on commit 94795a3

Please sign in to comment.