From f91161874e516621aa9c1734391c5bd01d9d4e6d Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Fri, 27 Jun 2014 11:27:52 +0000 Subject: [PATCH] [x86] Teach the X86 backend to DAG-combine SSE2 shuffles that are trivially redundant. This fixes several cases in the new vector shuffle lowering algorithm which would generate redundant shuffle instructions for the sake of simplicity. I'm also deleting a testcase which was somewhat ridiculous. It was checking for a bug in 2007 about incorrectly transforming shuffles by looking for the string "-86" in the output of a pretty substantial function. This test case doesn't seem to have any value at this point. Differential Revision: http://reviews.llvm.org/D4240 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211889 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 102 +++++++++++++++++- .../CodeGen/X86/2007-09-18-ShuffleXformBug.ll | 30 ------ test/CodeGen/X86/vector-shuffle-128-v8.ll | 19 ++-- 3 files changed, 108 insertions(+), 43 deletions(-) delete mode 100644 test/CodeGen/X86/2007-09-18-ShuffleXformBug.ll diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 3ceeac3b34e..2e23cd3674a 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -19034,6 +19034,95 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// \brief Get the PSHUF-style mask from PSHUF node. +/// +/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4 +/// PSHUF-style masks that can be reused with such instructions. 
+static SmallVector getPSHUFShuffleMask(SDValue N) { + SmallVector Mask; + bool IsUnary; + bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary); + (void)HaveMask; + assert(HaveMask); + + switch (N.getOpcode()) { + case X86ISD::PSHUFD: + return Mask; + case X86ISD::PSHUFLW: + Mask.resize(4); + return Mask; + case X86ISD::PSHUFHW: + Mask.erase(Mask.begin(), Mask.begin() + 4); + for (int &M : Mask) + M -= 4; + return Mask; + default: + llvm_unreachable("No valid shuffle instruction found!"); + } +} + +/// \brief Try to combine x86 target specific shuffles. +static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + MVT VT = N.getSimpleValueType(); + SmallVector Mask; + + switch (N.getOpcode()) { + case X86ISD::PSHUFD: + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + Mask = getPSHUFShuffleMask(N); + assert(Mask.size() == 4); + break; + default: + return SDValue(); + } + + SDValue V = N.getOperand(0); + switch (N.getOpcode()) { + default: + break; + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + assert(VT == MVT::v8i16); + + // See if this reduces to a PSHUFD which is no more expensive and can + // combine with more operations. + if (Mask[0] % 2 == 0 && Mask[2] % 2 == 0 && + areAdjacentMasksSequential(Mask)) { + int DMask[] = {-1, -1, -1, -1}; + int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; + DMask[DOffset + 0] = DOffset + Mask[0] / 2; + DMask[DOffset + 1] = DOffset + Mask[2] / 2; + V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V); + DCI.AddToWorklist(V.getNode()); + V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V, + getV4X86ShuffleImm8ForMask(DMask, DAG)); + DCI.AddToWorklist(V.getNode()); + return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); + } + + // Fallthrough + case X86ISD::PSHUFD: + if (V.getOpcode() == N.getOpcode()) { + // If we have two sequential shuffles of the same kind we can always fold + // them. 
Even if there are multiple uses, this is beneficial because it + // breaks a dependency. + SmallVector VMask = getPSHUFShuffleMask(V); + for (int &M : Mask) + M = VMask[M]; + return DAG.getNode(N.getOpcode(), DL, VT, V.getOperand(0), + getV4X86ShuffleImm8ForMask(Mask, DAG)); + } + + break; + } + + return SDValue(); +} + /// PerformShuffleCombine - Performs several different shuffle combines. static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -19158,7 +19247,18 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); - return EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true); + SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true); + if (LD.getNode()) + return LD; + + if (isTargetShuffle(N->getOpcode())) { + SDValue Shuffle = + PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget); + if (Shuffle.getNode()) + return Shuffle; + } + + return SDValue(); } /// PerformTruncateCombine - Converts truncate operation to diff --git a/test/CodeGen/X86/2007-09-18-ShuffleXformBug.ll b/test/CodeGen/X86/2007-09-18-ShuffleXformBug.ll deleted file mode 100644 index 0ae1897e60e..00000000000 --- a/test/CodeGen/X86/2007-09-18-ShuffleXformBug.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep -- -86 - -define i16 @f(<4 x float>* %tmp116117.i1061.i) nounwind { -entry: - alloca [4 x <4 x float>] ; <[4 x <4 x float>]*>:0 [#uses=167] - alloca [4 x <4 x float>] ; <[4 x <4 x float>]*>:1 [#uses=170] - alloca [4 x <4 x i32>] ; <[4 x <4 x i32>]*>:2 [#uses=12] - %.sub6235.i = getelementptr [4 x <4 x float>]* %0, i32 0, i32 0 ; <<4 x float>*> [#uses=76] - %.sub.i = getelementptr [4 x <4 x float>]* %1, i32 0, i32 0 ; <<4 x float>*> [#uses=59] - - %tmp124.i1062.i = getelementptr <4 x float>* %tmp116117.i1061.i, i32 63 ; <<4 x float>*> [#uses=1] - %tmp125.i1063.i = load 
<4 x float>* %tmp124.i1062.i ; <<4 x float>> [#uses=5] - %tmp828.i1077.i = shufflevector <4 x float> %tmp125.i1063.i, <4 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > ; <<4 x float>> [#uses=4] - %tmp704.i1085.i = load <4 x float>* %.sub6235.i ; <<4 x float>> [#uses=1] - %tmp712.i1086.i = call <4 x float> @llvm.x86.sse.max.ps( <4 x float> %tmp704.i1085.i, <4 x float> %tmp828.i1077.i ) ; <<4 x float>> [#uses=1] - store <4 x float> %tmp712.i1086.i, <4 x float>* %.sub.i - - %tmp2587.i1145.gep.i = getelementptr [4 x <4 x float>]* %1, i32 0, i32 0, i32 2 ; [#uses=1] - %tmp5334.i = load float* %tmp2587.i1145.gep.i ; [#uses=5] - %tmp2723.i1170.i = insertelement <4 x float> undef, float %tmp5334.i, i32 2 ; <<4 x float>> [#uses=5] - store <4 x float> %tmp2723.i1170.i, <4 x float>* %.sub6235.i - - %tmp1406.i1367.i = shufflevector <4 x float> %tmp2723.i1170.i, <4 x float> undef, <4 x i32> < i32 2, i32 2, i32 2, i32 2 > ; <<4 x float>> [#uses=1] - %tmp84.i1413.i = load <4 x float>* %.sub6235.i ; <<4 x float>> [#uses=1] - %tmp89.i1415.i = fmul <4 x float> %tmp84.i1413.i, %tmp1406.i1367.i ; <<4 x float>> [#uses=1] - store <4 x float> %tmp89.i1415.i, <4 x float>* %.sub.i - ret i16 0 -} - -declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll index 7447fdc8747..091822b6cbb 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -51,7 +51,7 @@ define <8 x i16> @shuffle_v8i16_31206745(<8 x i16> %a, <8 x i16> %b) { ; CHECK-SSE2-LABEL: @shuffle_v8i16_31206745 ; CHECK-SSE2: # BB#0: ; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,1,2,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,4,5] +; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,3,2] ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle @@ -159,7 +159,7 @@ define <8 x i16> 
@shuffle_v8i16_26401375(<8 x i16> %a, <8 x i16> %b) { ; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6] ; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1] ; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,4,5] +; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,3,2] ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle @@ -273,8 +273,7 @@ define <8 x i16> @shuffle_v8i16_4563XXXX(<8 x i16> %a, <8 x i16> %b) { ; CHECK-SSE2: # BB#0: ; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0] ; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,3,0,1,4,5,6,7] +; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,0,2,3] ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle @@ -285,8 +284,7 @@ define <8 x i16> @shuffle_v8i16_01274563(<8 x i16> %a, <8 x i16> %b) { ; CHECK-SSE2: # BB#0: ; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,1,3] ; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,5,4,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,4,5] +; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,1,2] ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle @@ -297,8 +295,7 @@ define <8 x i16> @shuffle_v8i16_45630127(<8 x i16> %a, <8 x i16> %b) { ; CHECK-SSE2: # BB#0: ; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0] ; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,1,2,4,5,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,1,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,3,0,1,4,5,6,7] +; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,0,1,3] ; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,5,4] 
; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> @@ -359,9 +356,8 @@ define <8 x i16> @shuffle_v8i16_08196e7f(<8 x i16> %a, <8 x i16> %b) { define <8 x i16> @shuffle_v8i16_0c1d6879(<8 x i16> %a, <8 x i16> %b) { ; CHECK-SSE2-LABEL: @shuffle_v8i16_0c1d6879 ; CHECK-SSE2: # BB#0: +; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,0,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[2,3,0,1,4,5,6,7] ; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> @@ -458,8 +454,7 @@ define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) { ; CHECK-SSE2-NEXT: punpckhwd %xmm0, %xmm1 ; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,2,2,3,4,5,6,7] ; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,0] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,4,5] +; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,2] ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle