[X86][SSE] Attempt to combine 64-bit and 16-bit shuffles to unary shuffles before bit shifts

We are currently matching shuffles as bit shifts before trying unary permutes, which means we can't fold loads and the two-operand shift destructively overwrites its source register.

The 32-bit shuffles are a bit tricky and will be dealt with in a later patch.

llvm-svn: 306977
Simon Pilgrim 2017-07-02 13:19:10 +00:00
parent 66c4c00993
commit 490eeeb173
2 changed files with 59 additions and 70 deletions
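
For context, a minimal before/after sketch of the codegen effect, based on the SSE checks of the combine_vpshufb_as_pshuflw_not_pslld test below; the pshuflw immediate is my own reconstruction for the mask [0,0,2,2] and is not checked by the test:

# Previously the bit shift was matched first: the load cannot be folded into
# pslld and the two-operand shift overwrites its source register.
movdqa (%rdi), %xmm0
pslld $16, %xmm0
# With this change the v8i16 unary permute is matched first, so the load
# folds straight into the shuffle and no register copy is needed.
pshuflw $160, (%rdi), %xmm0 # xmm0 = mem[0,0,2,2,4,5,6,7] ($160 reconstructed)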


@@ -27120,29 +27120,44 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
ContainsZeros |= (M == SM_SentinelZero);
}
// Attempt to match against byte/bit shifts.
// FIXME: Add 512-bit support.
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
MaskScalarSizeInBits, Mask,
0, Zeroable, Subtarget);
if (0 < ShiftAmt) {
PermuteImm = (unsigned)ShiftAmt;
// Handle VPERMI/VPERMILPD vXi64/vXi64 patterns.
if (!ContainsZeros && MaskScalarSizeInBits == 64) {
// Check for lane crossing permutes.
if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
// PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
Shuffle = X86ISD::VPERMI;
ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
PermuteImm = getV4X86ShuffleImm(Mask);
return true;
}
if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
SmallVector<int, 4> RepeatedMask;
if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
Shuffle = X86ISD::VPERMI;
ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
PermuteImm = getV4X86ShuffleImm(RepeatedMask);
return true;
}
}
} else if (AllowFloatDomain && Subtarget.hasAVX()) {
// VPERMILPD can permute with a non-repeating shuffle.
Shuffle = X86ISD::VPERMILPI;
ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
PermuteImm = 0;
for (int i = 0, e = Mask.size(); i != e; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
PermuteImm |= (M & 1) << i;
}
return true;
}
}
// Ensure we don't contain any zero elements.
if (ContainsZeros)
return false;
assert(llvm::all_of(Mask, [&](int M) {
return SM_SentinelUndef <= M && M < (int)NumMaskElts;
}) && "Expected unary shuffle");
// Handle PSHUFLW/PSHUFHW repeated patterns.
if (MaskScalarSizeInBits == 16) {
// Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
ArrayRef<int> LoMask(Mask.data() + 0, 4);
@@ -27170,12 +27185,30 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
return true;
}
return false;
}
return false;
}
// Attempt to match against byte/bit shifts.
// FIXME: Add 512-bit support.
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
MaskScalarSizeInBits, Mask,
0, Zeroable, Subtarget);
if (0 < ShiftAmt) {
PermuteImm = (unsigned)ShiftAmt;
return true;
}
}
// Ensure we don't contain any zero elements.
if (ContainsZeros)
return false;
assert(llvm::all_of(Mask, [&](int M) {
return SM_SentinelUndef <= M && M < (int)NumMaskElts;
}) && "Expected unary shuffle");
// We only support permutation of 32/64 bit elements after this.
if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
return false;
@@ -27185,48 +27218,6 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
if ((AllowFloatDomain && !AllowIntDomain) && !Subtarget.hasAVX())
return false;
// Pre-AVX2 we must use float shuffles on 256-bit vectors.
if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) {
AllowFloatDomain = true;
AllowIntDomain = false;
}
// Check for lane crossing permutes.
if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
// PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
Shuffle = X86ISD::VPERMI;
ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
PermuteImm = getV4X86ShuffleImm(Mask);
return true;
}
if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) {
SmallVector<int, 4> RepeatedMask;
if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
Shuffle = X86ISD::VPERMI;
ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
PermuteImm = getV4X86ShuffleImm(RepeatedMask);
return true;
}
}
return false;
}
// VPERMILPD can permute with a non-repeating shuffle.
if (AllowFloatDomain && MaskScalarSizeInBits == 64) {
Shuffle = X86ISD::VPERMILPI;
ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
PermuteImm = 0;
for (int i = 0, e = Mask.size(); i != e; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
PermuteImm |= (M & 1) << i;
}
return true;
}
// We need a repeating shuffle mask for VPERMILPS/PSHUFD.
SmallVector<int, 4> RepeatedMask;
if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
@@ -27578,7 +27569,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Which shuffle domains are permitted?
// Permit domain crossing at higher combine depths.
bool AllowFloatDomain = FloatDomain || (Depth > 3);
bool AllowIntDomain = !FloatDomain || (Depth > 3);
bool AllowIntDomain = (!FloatDomain || (Depth > 3)) &&
(!MaskVT.is256BitVector() || Subtarget.hasAVX2());
if (UnaryShuffle) {
// If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
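
As an aside, a tiny sketch (my own example, not taken from this commit or its tests) of the immediate that the new "PermuteImm |= (M & 1) << i" loop above builds for VPERMILPD: each mask element contributes one bit selecting the low or high f64 within its 128-bit lane, so a v4f64 mask of [1,0,3,2] encodes as 0b0101 and, with AVX, the load can be folded:

vpermilpd $5, (%rdi), %ymm0 # ymm0 = mem[1,0,3,2] (illustrative only)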


@@ -445,18 +445,15 @@ define <16 x i8> @combine_pshufb_not_as_pshufw(<16 x i8> %a0) {
ret <16 x i8> %res1
}
; TODO - we could fold the load if we lowered to pshuflw instead.
define <16 x i8> @combine_vpshufb_as_pshuflw_not_pslld(<16 x i8> *%a0) {
; SSE-LABEL: combine_vpshufb_as_pshuflw_not_pslld:
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,2,2,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vpshufb_as_pshuflw_not_pslld:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpslld $16, %xmm0, %xmm0
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,2,2,4,5,6,7]
; AVX-NEXT: retq
%res0 = load <16 x i8>, <16 x i8> *%a0, align 16
%res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 undef, i8 undef, i8 0, i8 1, i8 undef, i8 undef, i8 4, i8 5, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)