[VectorCombine] foldBitcastShuf - add support for length changing shuffles

Allow length-changing shuffle masks in the "bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'" fold.

This change also exposes some poor shuffle mask detection for extract/insert subvector cases inside improveShuffleKindFromMask.

First stage towards addressing Issue #67803
This commit is contained in:
Simon Pilgrim 2023-10-06 11:35:47 +01:00
parent 3bae69ec8c
commit 94795a37e8
3 changed files with 69 additions and 31 deletions

View File

@ -689,15 +689,18 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) {
// 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
// scalable type is unknown; Second, we cannot reason if the narrowed shuffle
// mask for scalable type is a splat or not.
// 2) Disallow non-vector casts and length-changing shuffles.
// 2) Disallow non-vector casts.
// TODO: We could allow any shuffle.
auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
auto *SrcTy = dyn_cast<FixedVectorType>(V->getType());
if (!SrcTy || I.getOperand(0)->getType() != SrcTy)
if (!DestTy || !SrcTy)
return false;
auto *DestTy = cast<FixedVectorType>(I.getType());
unsigned DestEltSize = DestTy->getScalarSizeInBits();
unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
return false;
SmallVector<int, 16> NewMask;
if (DestEltSize <= SrcEltSize) {
// The bitcast is from wide to narrow/equal elements. The shuffle mask can
@ -714,10 +717,15 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) {
return false;
}
// Bitcast the shuffle src - keep its original width but using the destination
// scalar type.
unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
auto *ShuffleTy = FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
// The new shuffle must not cost more than the old shuffle. The bitcast is
// moved ahead of the shuffle, so assume that it has the same cost as before.
InstructionCost DestCost = TTI.getShuffleCost(
TargetTransformInfo::SK_PermuteSingleSrc, DestTy, NewMask);
TargetTransformInfo::SK_PermuteSingleSrc, ShuffleTy, NewMask);
InstructionCost SrcCost =
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy, Mask);
if (DestCost > SrcCost || !DestCost.isValid())
@ -725,7 +733,7 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) {
// bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
++NumShufOfBitcast;
Value *CastV = Builder.CreateBitCast(V, DestTy);
Value *CastV = Builder.CreateBitCast(V, ShuffleTy);
Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask);
replaceValue(I, *Shuf);
return true;

View File

@ -33,13 +33,18 @@ define <4 x float> @bitcast_shuf_same_size(<4 x i32> %v) {
ret <4 x float> %r
}
; TODO - length-changing shuffle
; Length-changing shuffles
define <16 x i8> @bitcast_shuf_narrow_element_subvector(<2 x i32> %v) {
; CHECK-LABEL: @bitcast_shuf_narrow_element_subvector(
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
; CHECK-NEXT: ret <16 x i8> [[R]]
; SSE-LABEL: @bitcast_shuf_narrow_element_subvector(
; SSE-NEXT: [[SHUF:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
; SSE-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
; SSE-NEXT: ret <16 x i8> [[R]]
;
; AVX-LABEL: @bitcast_shuf_narrow_element_subvector(
; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
; AVX-NEXT: [[R:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
; AVX-NEXT: ret <16 x i8> [[R]]
;
%shuf = shufflevector <2 x i32> %v, <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
%r = bitcast <4 x i32> %shuf to <16 x i8>
@ -47,10 +52,15 @@ define <16 x i8> @bitcast_shuf_narrow_element_subvector(<2 x i32> %v) {
}
define <16 x i16> @bitcast_shuf_narrow_element_concat_subvectors(<2 x i64> %v) {
; CHECK-LABEL: @bitcast_shuf_narrow_element_concat_subvectors(
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i64> [[V:%.*]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i64> [[SHUF]] to <16 x i16>
; CHECK-NEXT: ret <16 x i16> [[R]]
; SSE-LABEL: @bitcast_shuf_narrow_element_concat_subvectors(
; SSE-NEXT: [[SHUF:%.*]] = shufflevector <2 x i64> [[V:%.*]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; SSE-NEXT: [[R:%.*]] = bitcast <4 x i64> [[SHUF]] to <16 x i16>
; SSE-NEXT: ret <16 x i16> [[R]]
;
; AVX-LABEL: @bitcast_shuf_narrow_element_concat_subvectors(
; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[V:%.*]] to <8 x i16>
; AVX-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; AVX-NEXT: ret <16 x i16> [[R]]
;
%shuf = shufflevector <2 x i64> %v, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
%r = bitcast <4 x i64> %shuf to <16 x i16>
@ -58,10 +68,15 @@ define <16 x i16> @bitcast_shuf_narrow_element_concat_subvectors(<2 x i64> %v) {
}
define <16 x i8> @bitcast_shuf_extract_subvector(<8 x i32> %v) {
; CHECK-LABEL: @bitcast_shuf_extract_subvector(
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <8 x i32> [[V:%.*]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
; CHECK-NEXT: ret <16 x i8> [[R]]
; SSE-LABEL: @bitcast_shuf_extract_subvector(
; SSE-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[V:%.*]] to <32 x i8>
; SSE-NEXT: [[R:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; SSE-NEXT: ret <16 x i8> [[R]]
;
; AVX-LABEL: @bitcast_shuf_extract_subvector(
; AVX-NEXT: [[SHUF:%.*]] = shufflevector <8 x i32> [[V:%.*]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; AVX-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
; AVX-NEXT: ret <16 x i8> [[R]]
;
%shuf = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = bitcast <4 x i32> %shuf to <16 x i8>

View File

@ -33,13 +33,18 @@ define <4 x float> @bitcast_shuf_same_size(<4 x i32> %v) {
ret <4 x float> %r
}
; TODO - Length-changing shuffle
; Length-changing shuffles
define <16 x i8> @bitcast_shuf_narrow_element_subvector(<2 x i32> %v) {
; CHECK-LABEL: @bitcast_shuf_narrow_element_subvector(
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
; CHECK-NEXT: ret <16 x i8> [[R]]
; SSE-LABEL: @bitcast_shuf_narrow_element_subvector(
; SSE-NEXT: [[SHUF:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
; SSE-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
; SSE-NEXT: ret <16 x i8> [[R]]
;
; AVX-LABEL: @bitcast_shuf_narrow_element_subvector(
; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
; AVX-NEXT: [[R:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
; AVX-NEXT: ret <16 x i8> [[R]]
;
%shuf = shufflevector <2 x i32> %v, <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
%r = bitcast <4 x i32> %shuf to <16 x i8>
@ -47,10 +52,15 @@ define <16 x i8> @bitcast_shuf_narrow_element_subvector(<2 x i32> %v) {
}
define <16 x i16> @bitcast_shuf_narrow_element_concat_subvectors(<2 x i64> %v) {
; CHECK-LABEL: @bitcast_shuf_narrow_element_concat_subvectors(
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i64> [[V:%.*]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i64> [[SHUF]] to <16 x i16>
; CHECK-NEXT: ret <16 x i16> [[R]]
; SSE-LABEL: @bitcast_shuf_narrow_element_concat_subvectors(
; SSE-NEXT: [[SHUF:%.*]] = shufflevector <2 x i64> [[V:%.*]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; SSE-NEXT: [[R:%.*]] = bitcast <4 x i64> [[SHUF]] to <16 x i16>
; SSE-NEXT: ret <16 x i16> [[R]]
;
; AVX-LABEL: @bitcast_shuf_narrow_element_concat_subvectors(
; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[V:%.*]] to <8 x i16>
; AVX-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; AVX-NEXT: ret <16 x i16> [[R]]
;
%shuf = shufflevector <2 x i64> %v, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
%r = bitcast <4 x i64> %shuf to <16 x i16>
@ -58,10 +68,15 @@ define <16 x i16> @bitcast_shuf_narrow_element_concat_subvectors(<2 x i64> %v) {
}
define <16 x i8> @bitcast_shuf_extract_subvector(<8 x i32> %v) {
; CHECK-LABEL: @bitcast_shuf_extract_subvector(
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <8 x i32> [[V:%.*]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
; CHECK-NEXT: ret <16 x i8> [[R]]
; SSE-LABEL: @bitcast_shuf_extract_subvector(
; SSE-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[V:%.*]] to <32 x i8>
; SSE-NEXT: [[R:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; SSE-NEXT: ret <16 x i8> [[R]]
;
; AVX-LABEL: @bitcast_shuf_extract_subvector(
; AVX-NEXT: [[SHUF:%.*]] = shufflevector <8 x i32> [[V:%.*]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; AVX-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
; AVX-NEXT: ret <16 x i8> [[R]]
;
%shuf = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = bitcast <4 x i32> %shuf to <16 x i8>