[X86][SSE] Attempt to combine 64-bit and 16-bit shuffles to unary shuffles before bit shifts
We were combining shuffles to bit shifts before unary permutes, which meant we couldn't fold loads, and the bit shift destructively overwrites its register operand. The 32-bit shuffles are a bit tricky and will be dealt with in a later patch.

llvm-svn: 306977
parent 66c4c00993
commit 490eeeb173
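To illustrate the load-folding point from the message above, here is a hand-written sketch (not part of this commit; the function name and shuffle mask are invented for the example). A unary <8 x i16> shuffle that moves elements 0 and 2 into positions 1 and 3 and leaves everything else undef can be matched either as a 32-bit immediate shift (pslld $16), which cannot take a memory source operand and destructively overwrites its register, or as a low-halfword permute (pshuflw), which can read its source straight from memory:

define <8 x i16> @sketch_shift_or_pshuflw(<8 x i16>* %p) {
  %v = load <8 x i16>, <8 x i16>* %p, align 16
  ; Elements 1 and 3 take elements 0 and 2; the remaining lanes are undef,
  ; so this matches both "pslld $16, %xmm0" (register-only, destructive)
  ; and "pshuflw $0xa0, (%rdi), %xmm0" (load folded into the permute).
  %s = shufflevector <8 x i16> %v, <8 x i16> undef,
       <8 x i32> <i32 undef, i32 0, i32 undef, i32 2,
                  i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %s
}

The combine_vpshufb_as_pshuflw_not_pslld test in the diff below exercises the same pattern through a PSHUFB intrinsic.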
@@ -27120,29 +27120,44 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
ContainsZeros |= (M == SM_SentinelZero);
}

// Attempt to match against byte/bit shifts.
// FIXME: Add 512-bit support.
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
MaskScalarSizeInBits, Mask,
0, Zeroable, Subtarget);
if (0 < ShiftAmt) {
PermuteImm = (unsigned)ShiftAmt;
// Handle VPERMI/VPERMILPD vXi64/vXi64 patterns.
if (!ContainsZeros && MaskScalarSizeInBits == 64) {
// Check for lane crossing permutes.
if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
// PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
Shuffle = X86ISD::VPERMI;
ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
PermuteImm = getV4X86ShuffleImm(Mask);
return true;
}
if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
SmallVector<int, 4> RepeatedMask;
if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
Shuffle = X86ISD::VPERMI;
ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
PermuteImm = getV4X86ShuffleImm(RepeatedMask);
return true;
}
}
} else if (AllowFloatDomain && Subtarget.hasAVX()) {
// VPERMILPD can permute with a non-repeating shuffle.
Shuffle = X86ISD::VPERMILPI;
ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
PermuteImm = 0;
for (int i = 0, e = Mask.size(); i != e; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
PermuteImm |= (M & 1) << i;
}
return true;
}
}

// Ensure we don't contain any zero elements.
if (ContainsZeros)
return false;

assert(llvm::all_of(Mask, [&](int M) {
return SM_SentinelUndef <= M && M < (int)NumMaskElts;
}) && "Expected unary shuffle");

// Handle PSHUFLW/PSHUFHW repeated patterns.
if (MaskScalarSizeInBits == 16) {
// Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
ArrayRef<int> LoMask(Mask.data() + 0, 4);
@@ -27170,12 +27185,30 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
return true;
}

return false;
}
return false;
}

// Attempt to match against byte/bit shifts.
// FIXME: Add 512-bit support.
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
MaskScalarSizeInBits, Mask,
0, Zeroable, Subtarget);
if (0 < ShiftAmt) {
PermuteImm = (unsigned)ShiftAmt;
return true;
}
}

// Ensure we don't contain any zero elements.
if (ContainsZeros)
return false;

assert(llvm::all_of(Mask, [&](int M) {
return SM_SentinelUndef <= M && M < (int)NumMaskElts;
}) && "Expected unary shuffle");

// We only support permutation of 32/64 bit elements after this.
if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
return false;
@@ -27185,48 +27218,6 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
if ((AllowFloatDomain && !AllowIntDomain) && !Subtarget.hasAVX())
return false;

// Pre-AVX2 we must use float shuffles on 256-bit vectors.
if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) {
AllowFloatDomain = true;
AllowIntDomain = false;
}

// Check for lane crossing permutes.
if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
// PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
Shuffle = X86ISD::VPERMI;
ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
PermuteImm = getV4X86ShuffleImm(Mask);
return true;
}
if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) {
SmallVector<int, 4> RepeatedMask;
if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
Shuffle = X86ISD::VPERMI;
ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
PermuteImm = getV4X86ShuffleImm(RepeatedMask);
return true;
}
}
return false;
}

// VPERMILPD can permute with a non-repeating shuffle.
if (AllowFloatDomain && MaskScalarSizeInBits == 64) {
Shuffle = X86ISD::VPERMILPI;
ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
PermuteImm = 0;
for (int i = 0, e = Mask.size(); i != e; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
PermuteImm |= (M & 1) << i;
}
return true;
}

// We need a repeating shuffle mask for VPERMILPS/PSHUFD.
SmallVector<int, 4> RepeatedMask;
if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
@@ -27578,7 +27569,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Which shuffle domains are permitted?
// Permit domain crossing at higher combine depths.
bool AllowFloatDomain = FloatDomain || (Depth > 3);
bool AllowIntDomain = !FloatDomain || (Depth > 3);
bool AllowIntDomain = (!FloatDomain || (Depth > 3)) &&
(!MaskVT.is256BitVector() || Subtarget.hasAVX2());

if (UnaryShuffle) {
// If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
@@ -445,18 +445,15 @@ define <16 x i8> @combine_pshufb_not_as_pshufw(<16 x i8> %a0) {
ret <16 x i8> %res1
}

; TODO - we could fold the load if we lowered to pshuflw instead.
define <16 x i8> @combine_vpshufb_as_pshuflw_not_pslld(<16 x i8> *%a0) {
; SSE-LABEL: combine_vpshufb_as_pshuflw_not_pslld:
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,2,2,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vpshufb_as_pshuflw_not_pslld:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpslld $16, %xmm0, %xmm0
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,2,2,4,5,6,7]
; AVX-NEXT: retq
%res0 = load <16 x i8>, <16 x i8> *%a0, align 16
%res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 undef, i8 undef, i8 0, i8 1, i8 undef, i8 undef, i8 4, i8 5, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)