[x86] Add a generic unpack-targeted lowering technique. This can be used

to generically lower blends and is particularly nice because it is
available frome SSE2 onward. This removes a lot of the remaining domain
crossing blends in SSE2 code.

I'm hoping to replace some of the "interleaved" lowering hacks with
something closer to this which should be more principled. First, this
needs to learn how to detect and use other interleavings besides that of
the natural type provided. That will be a follow-up patch though.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@229378 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Chandler Carruth 2015-02-16 12:28:18 +00:00
parent e62fbca6b7
commit cbe6ecfc81
6 changed files with 246 additions and 169 deletions

View File

@ -8463,6 +8463,50 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
DAG.getConstant(InsertPSMask, MVT::i8));
}
/// \brief Try to lower a shuffle as a permute of the inputs followed by an
/// UNPCK instruction.
///
/// This specifically targets cases where we end up with alternating between
/// the two inputs, and so can permute them into something that feeds a single
/// UNPCK instruction.
static SDValue lowerVectorShuffleAsUnpack(MVT VT, SDLoc DL, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(!isSingleInputShuffleMask(Mask) &&
"This routine should only be used when blending two inputs.");
assert(Mask.size() >= 2 && "Single element masks are invalid.");
int Size = Mask.size();
int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) {
return M >= 0 && M % Size < Size / 2;
});
int NumHiInputs = std::count_if(
Mask.begin(), Mask.end(), [Size](int M) { return M % Size > Size / 2; });
bool UnpackLo = NumLoInputs >= NumHiInputs;
SmallVector<int, 32> V1Mask(Mask.size(), -1);
SmallVector<int, 32> V2Mask(Mask.size(), -1);
for (int i = 0; i < Size; ++i) {
if (Mask[i] < 0)
continue;
// We only handle the case where V1 feeds even mask slots and V2 feeds odd
// mask slots. We rely on canonicalization to ensure this is the case.
if ((i % 2 == 0) != (Mask[i] < Size))
return SDValue();
SmallVectorImpl<int> &VMask = (i % 2 == 0) ? V1Mask : V2Mask;
VMask[i / 2 + (UnpackLo ? 0 : Size / 2)] = Mask[i] % Size;
}
V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, VT, V1,
V2);
}
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
@ -8921,6 +8965,11 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
Mask, DAG);
// Try to lower by permuting the inputs into an unpack instruction.
if (SDValue Unpack =
lowerVectorShuffleAsUnpack(MVT::v4i32, DL, V1, V2, Mask, DAG))
return Unpack;
// We implement this with SHUFPS because it can blend from two vectors.
// Because we're going to eventually use SHUFPS, we use SHUFPS even to build
// up the inputs, bypassing domain shift penalties that we would encur if we
@ -9670,6 +9719,11 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
Mask, DAG);
// Try to lower by permuting the inputs into an unpack instruction.
if (SDValue Unpack =
lowerVectorShuffleAsUnpack(MVT::v8i16, DL, V1, V2, Mask, DAG))
return Unpack;
int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};

View File

@ -7,9 +7,10 @@ define <4 x i32> @a(<4 x i32> %i) nounwind {
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [117,117,117,117]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: a:
@ -46,10 +47,11 @@ define <4 x i32> @c(<4 x i32> %i, <4 x i32> %j) nounwind {
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: c:
@ -91,13 +93,14 @@ define <4 x i32> @e(<4 x i32> %i, <4 x i32> %j) nounwind {
; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: callq foo
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: addq $40, %rsp
; SSE2-NEXT: retq
;

View File

@ -315,10 +315,11 @@ define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
; CHECK: ## BB#0:
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-NEXT: pmuludq %xmm1, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-NEXT: pmuludq %xmm2, %xmm1
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: retl
%m = mul <4 x i32> %x, %y
ret <4 x i32> %m

View File

@ -25,11 +25,12 @@ define <4 x i32> @test1(<4 x i32> %a) {
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm1, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT: psubd %xmm2, %xmm0
; SSE-NEXT: psrld $1, %xmm0
; SSE-NEXT: paddd %xmm2, %xmm0
@ -85,20 +86,22 @@ define <8 x i32> @test2(<8 x i32> %a) {
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm5
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
; SSE-NEXT: psubd %xmm3, %xmm0
; SSE-NEXT: psrld $1, %xmm0
; SSE-NEXT: paddd %xmm3, %xmm0
; SSE-NEXT: psrld $2, %xmm0
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: psubd %xmm2, %xmm1
; SSE-NEXT: psrld $1, %xmm1
; SSE-NEXT: paddd %xmm2, %xmm1
@ -838,21 +841,22 @@ define <4 x i32> @test8(<4 x i32> %a) {
;
; SSE-LABEL: test8:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psrad $31, %xmm3
; SSE-NEXT: pand %xmm2, %xmm3
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pmuludq %xmm2, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrad $31, %xmm2
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: paddd %xmm2, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSE-NEXT: psubd %xmm3, %xmm1
; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pmuludq %xmm1, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: paddd %xmm1, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE-NEXT: psubd %xmm2, %xmm1
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: psrld $31, %xmm0
@ -911,46 +915,48 @@ define <8 x i32> @test9(<8 x i32> %a) {
;
; SSE-LABEL: test9:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: psrad $31, %xmm4
; SSE-NEXT: movdqa %xmm4, %xmm0
; SSE-NEXT: pand %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm3, %xmm5
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm2, %xmm5
; SSE-NEXT: psrad $31, %xmm5
; SSE-NEXT: pand %xmm1, %xmm5
; SSE-NEXT: pand %xmm3, %xmm5
; SSE-NEXT: paddd %xmm0, %xmm5
; SSE-NEXT: movdqa %xmm3, %xmm0
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: pmuludq %xmm3, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
; SSE-NEXT: pmuludq %xmm6, %xmm7
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm7[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE-NEXT: psubd %xmm5, %xmm0
; SSE-NEXT: paddd %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psrld $31, %xmm3
; SSE-NEXT: psrad $2, %xmm0
; SSE-NEXT: paddd %xmm3, %xmm0
; SSE-NEXT: pand %xmm2, %xmm4
; SSE-NEXT: movdqa %xmm2, %xmm3
; SSE-NEXT: psrad $31, %xmm3
; SSE-NEXT: pand %xmm1, %xmm3
; SSE-NEXT: paddd %xmm4, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE-NEXT: pmuludq %xmm6, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSE-NEXT: psubd %xmm3, %xmm1
; SSE-NEXT: paddd %xmm2, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrld $31, %xmm2
; SSE-NEXT: psrad $2, %xmm1
; SSE-NEXT: paddd %xmm2, %xmm1
; SSE-NEXT: psrad $2, %xmm0
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: pand %xmm1, %xmm4
; SSE-NEXT: movdqa %xmm1, %xmm5
; SSE-NEXT: psrad $31, %xmm5
; SSE-NEXT: pand %xmm3, %xmm5
; SSE-NEXT: paddd %xmm4, %xmm5
; SSE-NEXT: pmuludq %xmm1, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT: pmuludq %xmm6, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: psubd %xmm5, %xmm2
; SSE-NEXT: paddd %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: psrld $31, %xmm1
; SSE-NEXT: psrad $2, %xmm2
; SSE-NEXT: paddd %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test9:
@ -1006,41 +1012,45 @@ define <8 x i32> @test10(<8 x i32> %a) {
;
; SSE-LABEL: test10:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [613566757,613566757,613566757,613566757]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pmuludq %xmm3, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm5
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE-NEXT: movdqa %xmm0, %xmm5
; SSE-NEXT: psubd %xmm3, %xmm5
; SSE-NEXT: psubd %xmm2, %xmm5
; SSE-NEXT: psrld $1, %xmm5
; SSE-NEXT: paddd %xmm3, %xmm5
; SSE-NEXT: paddd %xmm2, %xmm5
; SSE-NEXT: psrld $2, %xmm5
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7]
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [7,7,7,7]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
; SSE-NEXT: pmuludq %xmm3, %xmm5
; SSE-NEXT: pmuludq %xmm3, %xmm6
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2,1,3]
; SSE-NEXT: pmuludq %xmm2, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE-NEXT: pmuludq %xmm2, %xmm6
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE-NEXT: psubd %xmm5, %xmm0
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm5
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm5[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: psubd %xmm2, %xmm4
; SSE-NEXT: psubd %xmm3, %xmm4
; SSE-NEXT: psrld $1, %xmm4
; SSE-NEXT: paddd %xmm2, %xmm4
; SSE-NEXT: paddd %xmm3, %xmm4
; SSE-NEXT: psrld $2, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE-NEXT: pmuludq %xmm3, %xmm4
; SSE-NEXT: pmuludq %xmm3, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE-NEXT: pmuludq %xmm2, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
; SSE-NEXT: psubd %xmm4, %xmm1
; SSE-NEXT: retq
;
@ -1109,13 +1119,14 @@ define <8 x i32> @test11(<8 x i32> %a) {
; SSE-NEXT: psrad $31, %xmm6
; SSE-NEXT: pand %xmm2, %xmm6
; SSE-NEXT: paddd %xmm4, %xmm6
; SSE-NEXT: movdqa %xmm0, %xmm7
; SSE-NEXT: pmuludq %xmm2, %xmm7
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: pmuludq %xmm2, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm5, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm4[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
; SSE-NEXT: psubd %xmm6, %xmm7
; SSE-NEXT: paddd %xmm0, %xmm7
; SSE-NEXT: movdqa %xmm7, %xmm4
@ -1125,9 +1136,10 @@ define <8 x i32> @test11(<8 x i32> %a) {
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm7
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
; SSE-NEXT: pmuludq %xmm4, %xmm6
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
; SSE-NEXT: psubd %xmm7, %xmm0
; SSE-NEXT: pand %xmm1, %xmm3
; SSE-NEXT: movdqa %xmm1, %xmm6
@ -1135,10 +1147,11 @@ define <8 x i32> @test11(<8 x i32> %a) {
; SSE-NEXT: pand %xmm2, %xmm6
; SSE-NEXT: paddd %xmm3, %xmm6
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT: pmuludq %xmm5, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: psubd %xmm6, %xmm2
; SSE-NEXT: paddd %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm3
@ -1147,9 +1160,10 @@ define <8 x i32> @test11(<8 x i32> %a) {
; SSE-NEXT: paddd %xmm3, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT: pmuludq %xmm4, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: psubd %xmm2, %xmm1
; SSE-NEXT: retq
;
@ -1222,15 +1236,15 @@ define <4 x i32> @PR20355(<4 x i32> %a) {
; SSE-NEXT: paddd %xmm2, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT: pmuludq %xmm2, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: psubd %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $31, %xmm1
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
; SSE-NEXT: psubd %xmm3, %xmm4
; SSE-NEXT: movdqa %xmm4, %xmm0
; SSE-NEXT: psrld $31, %xmm0
; SSE-NEXT: paddd %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: PR20355:

View File

@ -549,28 +549,27 @@ define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15(
; SSE2-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,1,0,4,5,6,7]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,1,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
; SSE2-NEXT: movdqa %xmm4, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,1,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,7,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,0,4,5,6,7]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,2,3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,1,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
@ -580,9 +579,9 @@ define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15(
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]

View File

@ -275,16 +275,18 @@ define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32
define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test1b:
; SSE2: # BB#0:
; SSE2-NEXT: andps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test1b:
; SSSE3: # BB#0:
; SSSE3-NEXT: andps %xmm1, %xmm0
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test1b:
@ -313,16 +315,18 @@ define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i3
define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test2b:
; SSE2: # BB#0:
; SSE2-NEXT: orps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test2b:
; SSSE3: # BB#0:
; SSSE3-NEXT: orps %xmm1, %xmm0
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test2b:
@ -390,18 +394,18 @@ define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i3
define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test4b:
; SSE2: # BB#0:
; SSE2-NEXT: andps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test4b:
; SSSE3: # BB#0:
; SSSE3-NEXT: andps %xmm1, %xmm0
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
; SSSE3-NEXT: movaps %xmm2, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test4b:
@ -430,18 +434,18 @@ define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i3
define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test5b:
; SSE2: # BB#0:
; SSE2-NEXT: orps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test5b:
; SSSE3: # BB#0:
; SSSE3-NEXT: orps %xmm1, %xmm0
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
; SSSE3-NEXT: movaps %xmm2, %xmm0
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test5b:
@ -1012,14 +1016,16 @@ define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test16:
; SSE2: # BB#0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_nested_undef_test16:
; SSSE3: # BB#0:
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_nested_undef_test16:
@ -1171,16 +1177,16 @@ define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test21:
; SSE2: # BB#0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,1]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_nested_undef_test21:
; SSSE3: # BB#0:
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,1]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_nested_undef_test21: