[X86][SSSE3] Bail out of lowerVectorShuffleAsPermuteAndUnpack for shuffle-with-zero (PR40306)

If we have PSHUFB and we're shuffling with a zero vector, then we are better off not lowering to VECTOR_SHUFFLE(UNPCK()), as we lose track of those zero elements.

llvm-svn: 351103
Simon Pilgrim 2019-01-14 19:07:26 +00:00
parent 6cc44ba56f
commit 459fed92ba
2 changed files with 19 additions and 11 deletions
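
For context, PSHUFB zeroes any destination byte whose shuffle-mask byte has its top bit set, so a single PSHUFB can fold the zero elements of a shuffle-with-zero directly into its mask. The standalone C++ sketch below (illustrative only, not LLVM code; the helper name pshufb is made up) models that semantics with a mask equivalent to the one in the updated SSSE3 check line further down:

// Minimal scalar model of PSHUFB (SSSE3) byte-shuffle semantics: if bit 7
// of a mask byte is set, the destination byte is zeroed; otherwise the low
// 4 bits select a source byte.
#include <array>
#include <cstdint>
#include <cstdio>

static std::array<uint8_t, 16> pshufb(const std::array<uint8_t, 16> &Src,
                                      const std::array<uint8_t, 16> &Mask) {
  std::array<uint8_t, 16> Dst{};
  for (int I = 0; I != 16; ++I)
    Dst[I] = (Mask[I] & 0x80) ? 0 : Src[Mask[I] & 0x0F];
  return Dst;
}

int main() {
  std::array<uint8_t, 16> X;
  for (int I = 0; I != 16; ++I)
    X[I] = 100 + I; // recognizable byte values standing in for %x
  // Word 0 <- bytes 2,3 of %x; words 1-3 <- zero (0x80 entries); the undef
  // words are also emitted as zero here for determinism.
  std::array<uint8_t, 16> M = {2,    3,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                               0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
  for (uint8_t B : pshufb(X, M))
    printf("%d ", B); // 102 103 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  printf("\n");
  return 0;
}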


@@ -11964,10 +11964,9 @@ static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
 /// because for floating point vectors we have a generalized SHUFPS lowering
 /// strategy that handles everything that doesn't *exactly* match an unpack,
 /// making this clever lowering unnecessary.
-static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
-                                                    SDValue V1, SDValue V2,
-                                                    ArrayRef<int> Mask,
-                                                    SelectionDAG &DAG) {
+static SDValue lowerVectorShuffleAsPermuteAndUnpack(
+    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
   assert(!VT.isFloatingPoint() &&
          "This routine only supports integer vectors.");
   assert(VT.is128BitVector() &&
@@ -12036,6 +12035,13 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
     if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
       return Unpack;
 
+  // If we have PSHUFB, and we're shuffling with a zero vector then we're
+  // better off not doing VECTOR_SHUFFLE(UNPCK()) as we lose track of those
+  // zero elements.
+  if (Subtarget.hasSSSE3() && (ISD::isBuildVectorAllZeros(V1.getNode()) ||
+                               ISD::isBuildVectorAllZeros(V2.getNode())))
+    return SDValue();
+
   // If none of the unpack-rooted lowerings worked (or were profitable) try an
   // initial unpack.
   if (NumLoInputs == 0 || NumHiInputs == 0) {
@@ -12549,7 +12555,7 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
     // Try to lower by permuting the inputs into an unpack instruction.
     if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
-            DL, MVT::v4i32, V1, V2, Mask, DAG))
+            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
       return Unpack;
   }
 
@@ -13245,8 +13251,8 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return BitBlend;
 
   // Try to lower by permuting the inputs into an unpack instruction.
-  if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
-                                                            V2, Mask, DAG))
+  if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
+          DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
     return Unpack;
 
   // If we can't directly blend but can use PSHUFB, that will be better as it
@@ -13534,7 +13540,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
     // shuffles will both be pshufb, in which case we shouldn't bother with
     // this.
     if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
-            DL, MVT::v16i8, V1, V2, Mask, DAG))
+            DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
       return Unpack;
 
     // If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
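
By contrast, the PUNPCK* instructions can only interleave lanes of their two operands; they take no mask and have no way to introduce zeros. A rough scalar model of PUNPCKLWD (again illustrative, not LLVM code) shows why a permute-of-unpack lowering must keep the all-zeros vector live as a real operand rather than folding it away:

// Scalar model of PUNPCKLWD: interleave the low four 16-bit lanes of A and
// B. The result contains a zero only if one of the operands supplies it.
#include <cstdint>
#include <cstdio>

static void punpcklwd(const uint16_t A[8], const uint16_t B[8],
                      uint16_t Out[8]) {
  for (int I = 0; I != 4; ++I) {
    Out[2 * I] = A[I];     // even lanes from the first operand
    Out[2 * I + 1] = B[I]; // odd lanes from the second operand
  }
}

int main() {
  uint16_t X[8] = {10, 11, 12, 13, 14, 15, 16, 17};
  uint16_t Zero[8] = {0};
  uint16_t Out[8];
  punpcklwd(X, Zero, Out);
  for (uint16_t W : Out)
    printf("%d ", W); // 10 0 11 0 12 0 13 0
  printf("\n");
  return 0;
}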


@@ -2488,17 +2488,19 @@ define <8 x i16> @shuffle_v8i16_9zzzuuuu(<8 x i16> %x) {
 ;
 ; SSSE3-LABEL: shuffle_v8i16_9zzzuuuu:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[6,7]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v8i16_9zzzuuuu:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[6,7]
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v8i16_9zzzuuuu:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[6,7]
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX-NEXT:    retq
   %r = shufflevector <8 x i16> zeroinitializer, <8 x i16> %x, <8 x i32> <i32 9, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %r
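
As a sanity check on the updated SSE41/AVX output: operand 0 of the shufflevector is zeroinitializer, so mask index 9 selects element 1 of %x while indices 1..3 select zeros, i.e. the expected result is <x1, 0, 0, 0, u, u, u, u>. The scalar sketch below (illustrative, not LLVM code) confirms that psrld $16 followed by pmovzxwq produces exactly that:

#include <cstdint>
#include <cstdio>

int main() {
  // %x = <8 x i16>; expected shuffle result is <x1, 0, 0, 0, u, u, u, u>.
  uint16_t X[8] = {10, 11, 12, 13, 14, 15, 16, 17};

  // psrld $16: logical right shift of each 32-bit lane by 16 bits, moving
  // the high word of each dword into the low position and zeroing the high.
  uint16_t Sh[8];
  for (int I = 0; I != 4; ++I) {
    uint32_t D = (uint32_t)X[2 * I] | ((uint32_t)X[2 * I + 1] << 16);
    D >>= 16;
    Sh[2 * I] = (uint16_t)(D & 0xFFFF);
    Sh[2 * I + 1] = (uint16_t)(D >> 16);
  }

  // pmovzxwq: zero-extend the two lowest words to 64-bit lanes.
  uint16_t R[8] = {Sh[0], 0, 0, 0, Sh[1], 0, 0, 0};

  for (uint16_t W : R)
    printf("%d ", W); // 11 0 0 0 0 0 0 0 == <x1, 0, 0, 0, ...>
  printf("\n");
  return 0;
}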