From 9b6e2452c34fe3e93eff9bc50bfe6d87b2dcb5af Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 3 Aug 2016 19:08:24 +0000
Subject: [PATCH] [X86][SSE] Enable target shuffle combining to combine multiple shuffle inputs.

We currently only support combining target shuffles that consist of a single source input (plus elements known to be undef/zero).

This patch generalizes the recursive combining of the target shuffle to collect all the inputs, merging any duplicates along the way, into a full set of src ops and its shuffle mask.

We uncover a number of cases where we have failed to combine a unary shuffle because the input has been duplicated and separated during lowering.

This will allow us to combine to 2-input shuffles in a future patch.

Differential Revision: https://reviews.llvm.org/D22859

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@277631 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp           | 106 +++++++++++++-----
 test/CodeGen/X86/avx512-build-vector.ll      |  11 +-
 .../X86/clear_upper_vector_element_bits.ll   |  16 +--
 test/CodeGen/X86/vector-shuffle-256-v16.ll   |  12 +-
 test/CodeGen/X86/vector-shuffle-256-v32.ll   |  24 ++--
 test/CodeGen/X86/vector-shuffle-combining.ll |   3 -
 6 files changed, 97 insertions(+), 75 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 4cf2639a481..d32fe19a363 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -25289,7 +25289,7 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
 /// instructions, and replace them with the slightly more expensive SSSE3
 /// PSHUFB instruction if available. We do this as the last combining step
 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
-/// a suitable short sequence of other instructions. The PHUFB will either
+/// a suitable short sequence of other instructions. The PSHUFB will either
 /// use a register or have to read from memory and so is slightly (but only
 /// slightly) more expensive than the other shuffle instructions.
 ///
@@ -25302,7 +25302,8 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
 /// would simplify under the threshold for PSHUFB formation because of
 /// combine-ordering. To fix this, we should do the redundant instruction
 /// combining in this recursive walk.
-static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
+static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
+                                          int SrcOpIndex, SDValue Root,
                                           ArrayRef<int> RootMask,
                                           int Depth, bool HasVariableMask,
                                           SelectionDAG &DAG,
@@ -25314,6 +25315,7 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
     return false;
 
   // Directly rip through bitcasts to find the underlying operand.
+  SDValue Op = SrcOps[SrcOpIndex];
   Op = peekThroughOneUseBitcasts(Op);
 
   MVT VT = Op.getSimpleValueType();
@@ -25331,6 +25333,27 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
   if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask))
     return false;
 
+  // Add the inputs to the Ops list, avoiding duplicates.
+  SmallVector<SDValue, 8> Ops(SrcOps.begin(), SrcOps.end());
+
+  int InputIdx0 = -1, InputIdx1 = -1;
+  for (int i = 0, e = Ops.size(); i < e; ++i) {
+    SDValue BC = peekThroughBitcasts(Ops[i]);
+    if (Input0 && BC == peekThroughBitcasts(Input0))
+      InputIdx0 = i;
+    if (Input1 && BC == peekThroughBitcasts(Input1))
+      InputIdx1 = i;
+  }
+
+  if (Input0 && InputIdx0 < 0) {
+    InputIdx0 = SrcOpIndex;
+    Ops[SrcOpIndex] = Input0;
+  }
+  if (Input1 && InputIdx1 < 0) {
+    InputIdx1 = Ops.size();
+    Ops.push_back(Input1);
+  }
+
   assert(VT.getVectorNumElements() == OpMask.size() &&
          "Different mask size from vector size!");
   assert(((RootMask.size() > OpMask.size() &&
@@ -25362,6 +25385,17 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
     }
 
     int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
+
+    // Just insert the scaled root mask value if it references an input other
+    // than the SrcOp we're currently inserting.
+    if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
+        (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
+      Mask.push_back(RootMaskedIdx);
+      continue;
+    }
+
+    RootMaskedIdx %= MaskWidth;
+
     int OpIdx = RootMaskedIdx / OpRatio;
     if (OpMask[OpIdx] < 0) {
       // The incoming lanes are zero or undef, it doesn't matter which ones we
@@ -25370,9 +25404,19 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
       continue;
     }
 
-    // Ok, we have non-zero lanes, map them through.
-    Mask.push_back(OpMask[OpIdx] * OpRatio +
-                   RootMaskedIdx % OpRatio);
+    // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
+    int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
+    OpMaskedIdx %= MaskWidth;
+
+    if (OpMask[OpIdx] < (int)OpMask.size()) {
+      assert(0 <= InputIdx0 && "Unknown target shuffle input");
+      OpMaskedIdx += InputIdx0 * MaskWidth;
+    } else {
+      assert(0 <= InputIdx1 && "Unknown target shuffle input");
+      OpMaskedIdx += InputIdx1 * MaskWidth;
+    }
+
+    Mask.push_back(OpMaskedIdx);
   }
 
   // Handle the all undef/zero cases early.
@@ -25389,29 +25433,35 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
     return true;
   }
 
-  int MaskSize = Mask.size();
-  bool UseInput0 = std::any_of(Mask.begin(), Mask.end(),
-                  [MaskSize](int Idx) { return 0 <= Idx && Idx < MaskSize; });
-  bool UseInput1 = std::any_of(Mask.begin(), Mask.end(),
-                  [MaskSize](int Idx) { return MaskSize <= Idx; });
-
-  // At the moment we can only combine unary shuffle mask cases.
-  if (UseInput0 && UseInput1)
-    return false;
-  else if (UseInput1) {
-    std::swap(Input0, Input1);
-    ShuffleVectorSDNode::commuteMask(Mask);
+  // Remove unused shuffle source ops.
+  SmallVector<SDValue, 8> UsedOps;
+  for (int i = 0, e = Ops.size(); i < e; ++i) {
+    int lo = UsedOps.size() * MaskWidth;
+    int hi = lo + MaskWidth;
+    if (std::any_of(Mask.begin(), Mask.end(),
+                    [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
+      UsedOps.push_back(Ops[i]);
+      continue;
+    }
+    for (int &M : Mask)
+      if (lo <= M)
+        M -= MaskWidth;
   }
-
-  assert(Input0 && "Shuffle with no inputs detected");
+  assert(!UsedOps.empty() && "Shuffle with no inputs detected");
+  Ops = UsedOps;
 
   HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
 
-  // See if we can recurse into Input0 (if it's a target shuffle).
-  if (Op->isOnlyUserOf(Input0.getNode()) &&
-      combineX86ShufflesRecursively(Input0, Root, Mask, Depth + 1,
-                                    HasVariableMask, DAG, DCI, Subtarget))
-    return true;
+  // See if we can recurse into each shuffle source op (if it's a target shuffle).
+  for (int i = 0, e = Ops.size(); i < e; ++i)
+    if (Ops[i].getNode()->hasOneUse() || Op->isOnlyUserOf(Ops[i].getNode()))
+      if (combineX86ShufflesRecursively(Ops, i, Root, Mask, Depth + 1,
+                                        HasVariableMask, DAG, DCI, Subtarget))
+        return true;
+
+  // At the moment we can only combine unary shuffle mask cases.
+  if (Ops.size() != 1)
+    return false;
 
   // Minor canonicalization of the accumulated shuffle mask to make it easier
   // to match below. All this does is detect masks with sequential pairs of
@@ -25423,7 +25473,7 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
     Mask = std::move(WidenedMask);
   }
 
-  return combineX86ShuffleChain(Input0, Root, Mask, Depth, HasVariableMask, DAG,
+  return combineX86ShuffleChain(Ops[0], Root, Mask, Depth, HasVariableMask, DAG,
                                 DCI, Subtarget);
 }
 
@@ -26099,8 +26149,8 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
     return LD;
 
   if (isTargetShuffle(N->getOpcode())) {
-    if (SDValue Shuffle =
-            combineTargetShuffle(SDValue(N, 0), DAG, DCI, Subtarget))
+    SDValue Op(N, 0);
+    if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
       return Shuffle;
 
     // Try recursively combining arbitrary sequences of x86 shuffle
@@ -26110,7 +26160,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
     // a particular chain.
     SmallVector<int, 1> NonceMask; // Just a placeholder.
     NonceMask.push_back(0);
-    if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
+    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
                                       /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
                                       DCI, Subtarget))
       return SDValue(); // This routine will use CombineTo to replace N.
diff --git a/test/CodeGen/X86/avx512-build-vector.ll b/test/CodeGen/X86/avx512-build-vector.ll
index 980b87187d9..40ee06b9fbb 100644
--- a/test/CodeGen/X86/avx512-build-vector.ll
+++ b/test/CodeGen/X86/avx512-build-vector.ll
@@ -14,13 +14,10 @@ define <16 x i32> @test2(<16 x i32> %x) {
 define <16 x float> @test3(<4 x float> %a) {
 ; CHECK-LABEL: test3:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
-; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[1,0],xmm0[0,1]
-; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; CHECK-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11,0,1,2,3],zero,zero,zero,zero
+; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpxor %ymm1, %ymm1, %ymm1
 ; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
   %b = extractelement <4 x float> %a, i32 2
diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
index e05451b8027..c50654aae33 100644
--- a/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -10,16 +10,12 @@
 define <2 x i64> @_clearupper2xi64a(<2 x i64>) nounwind {
 ; SSE-LABEL: _clearupper2xi64a:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: _clearupper2xi64a:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %x0 = extractelement <2 x i64> %0, i32 0
   %x1 = extractelement <2 x i64> %0, i32 1
@@ -35,13 +31,7 @@ define <2 x i64> @_clearupper2xi64a(<2 x i64>) nounwind {
 define <4 x i32> @_clearupper4xi32a(<4 x i32>) nounwind {
 ; SSE-LABEL: _clearupper4xi32a:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: _clearupper4xi32a:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 2182ffe0983..dce66d8449e 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -247,8 +247,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0
 ;
 ; AVX2-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
@@ -268,8 +267,7 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0
 ;
 ; AVX2-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
@@ -289,8 +287,7 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0
 ;
 ; AVX2-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
@@ -310,8 +307,7 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
 ;
 ; AVX2-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index b0566812ff7..788bea1e921 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -480,8 +480,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
@@ -502,8 +501,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
@@ -524,8 +522,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
@@ -546,8 +543,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
@@ -568,8 +564,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
@@ -590,8 +585,7 @@ define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
@@ -612,8 +606,7 @@ define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
@@ -636,8 +629,7 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
 ;
 ; AVX2-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    movl $15, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
 ; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
index 266a3658eda..b0505192fe8 100644
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -1812,9 +1812,6 @@ define void @combine_test23(<8 x float> %v, <2 x float>* %ptr) {
 ;
 ; AVX-LABEL: combine_test23:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
 ; AVX-NEXT:    vmovups %xmm0, (%rdi)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
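
For readers following the mask arithmetic in combineX86ShufflesRecursively above, the standalone sketch below (plain C++, not part of the patch; the name composeShuffleMasks, the fixed 4-element width and the example masks are all illustrative assumptions) shows the core idea: mask indices are treated as offsets into the concatenation of every collected source op, lanes that reference a different source op pass straight through, and lanes that hit the op being expanded are looked up in that op's own mask and rebased onto its input. Only the unary case (an op shuffling itself) is modelled here; the real code additionally rebases onto whichever Ops entry Input0/Input1 was merged into.

  // Simplified sketch only; not LLVM API.
  #include <cassert>
  #include <iostream>
  #include <vector>

  // Fold OpMask (the mask of source op SrcOpIndex) into RootMask. Mask indices
  // are "global": index I selects element (I % Width) of source op (I / Width),
  // matching how the patch concatenates all collected source ops.
  static std::vector<int> composeShuffleMasks(const std::vector<int> &RootMask,
                                              const std::vector<int> &OpMask,
                                              int SrcOpIndex, int NumOps,
                                              int Width) {
    assert(0 <= SrcOpIndex && SrcOpIndex < NumOps && "Bad source op index");
    std::vector<int> Mask;
    for (int RootIdx : RootMask) {
      // Lanes that reference some other source op pass straight through.
      if (RootIdx < SrcOpIndex * Width || (SrcOpIndex + 1) * Width <= RootIdx) {
        Mask.push_back(RootIdx);
        continue;
      }
      // Otherwise look the lane up in the op's own mask and rebase it onto the
      // op's input (here: the op itself, i.e. the unary case only). Negative
      // values are kept as undef/zero sentinels.
      int M = OpMask[RootIdx % Width];
      Mask.push_back(M < 0 ? M : SrcOpIndex * Width + (M % Width));
    }
    return Mask;
  }

  int main() {
    // Two 4-element source ops; the root mask reads lanes from both of them.
    std::vector<int> RootMask = {0, 2, 4, 6}; // op0[0], op0[2], op1[0], op1[2]
    std::vector<int> OpMask = {3, 2, 1, 0};   // op0 is itself a reversed shuffle
    std::vector<int> Combined = composeShuffleMasks(
        RootMask, OpMask, /*SrcOpIndex=*/0, /*NumOps=*/2, /*Width=*/4);
    for (int M : Combined)
      std::cout << M << ' '; // prints: 3 1 4 6
    std::cout << '\n';
    return 0;
  }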