diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index d2c410240da..26e0e2ea26b 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -5133,30 +5133,38 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, } /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the -/// target specific opcode. Returns true if the Mask could be calculated. -/// Sets IsUnary to true if only uses one source. +/// target specific opcode. Returns true if the Mask could be calculated. Sets +/// IsUnary to true if only uses one source. Note that this will set IsUnary for +/// shuffles which use a single input multiple times, and in those cases it will +/// adjust the mask to only have indices within that single input. static bool getTargetShuffleMask(SDNode *N, MVT VT, SmallVectorImpl &Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); SDValue ImmN; IsUnary = false; + bool IsFakeUnary = false; switch(N->getOpcode()) { case X86ISD::SHUFP: ImmN = N->getOperand(N->getNumOperands()-1); DecodeSHUFPMask(VT, cast(ImmN)->getZExtValue(), Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::UNPCKH: DecodeUNPCKHMask(VT, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::UNPCKL: DecodeUNPCKLMask(VT, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVHLPS: DecodeMOVHLPSMask(NumElems, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVLHPS: DecodeMOVLHPSMask(NumElems, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::PALIGNR: ImmN = N->getOperand(N->getNumOperands()-1); @@ -5210,6 +5218,14 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, default: llvm_unreachable("unknown target shuffle node"); } + // If we have a fake unary shuffle, the shuffle mask is spread across two + // inputs that are actually the same node. Re-map the mask to always point + // into the first input. + if (IsFakeUnary) + for (int &M : Mask) + if (M >= (int)Mask.size()) + M -= Mask.size(); + return true; } @@ -18735,6 +18751,8 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef Mask, bool Lo = Mask.equals(0, 0); unsigned Shuffle = FloatDomain ? (Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS) : (Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH); + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! MVT ShuffleVT = FloatDomain ? MVT::v4f32 : MVT::v2i64; Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); DCI.AddToWorklist(Op.getNode()); @@ -18757,16 +18775,18 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef Mask, Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15))) { bool Lo = Mask[0] == 0; + unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! MVT ShuffleVT; switch (Mask.size()) { case 4: ShuffleVT = MVT::v4i32; break; - case 8: ShuffleVT = MVT::v8i32; break; - case 16: ShuffleVT = MVT::v16i32; break; + case 8: ShuffleVT = MVT::v8i16; break; + case 16: ShuffleVT = MVT::v16i8; break; }; Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); DCI.AddToWorklist(Op.getNode()); - Op = DAG.getNode(Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, ShuffleVT, Op, - Op); + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); DCI.AddToWorklist(Op.getNode()); DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), /*AddTo*/ true); diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll index 6d9b11792c7..ca540226ee7 100644 --- a/test/CodeGen/X86/avx-basic.ll +++ b/test/CodeGen/X86/avx-basic.ll @@ -72,9 +72,9 @@ entry: ret <4 x i64> %shuffle } -; CHECK: movlhps +; CHECK: vpunpcklqdq ; CHECK-NEXT: vextractf128 $1 -; CHECK-NEXT: movlhps +; CHECK-NEXT: vpunpcklqdq ; CHECK-NEXT: vinsertf128 $1 define <4 x i64> @C(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { entry: diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll index 8c1b7b63850..3856aeac3fd 100644 --- a/test/CodeGen/X86/avx-splat.ll +++ b/test/CodeGen/X86/avx-splat.ll @@ -1,9 +1,7 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s -; CHECK: vpunpcklbw %xmm -; CHECK-NEXT: vpunpckhbw %xmm -; CHECK-NEXT: vpshufd $85 +; CHECK: vpshufb {{.*}} ## xmm0 = xmm0[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; CHECK-NEXT: vinsertf128 $1 define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp { entry: @@ -21,7 +19,7 @@ entry: } ; CHECK: vmovq -; CHECK-NEXT: vmovlhps %xmm +; CHECK-NEXT: vpunpcklqdq %xmm ; CHECK-NEXT: vinsertf128 $1 define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp { entry: diff --git a/test/CodeGen/X86/exedepsfix-broadcast.ll b/test/CodeGen/X86/exedepsfix-broadcast.ll index a18f7519563..f4539c8969c 100644 --- a/test/CodeGen/X86/exedepsfix-broadcast.ll +++ b/test/CodeGen/X86/exedepsfix-broadcast.ll @@ -93,10 +93,10 @@ define <4 x double> @ExeDepsFix_broadcastsd256(<4 x double> %arg, <4 x double> % ; CHECK-LABEL: ExeDepsFix_broadcastsd_inreg -; ExeDepsFix works top down, thus it coalesces vmovlhps domain with -; vandps and there is nothing more you can do to match vmaxpd. -; CHECK: vmovlhps -; CHECK: vandps +; ExeDepsFix works top down, thus it coalesces vpunpcklqdq domain with +; vpand and there is nothing more you can do to match vmaxpd. +; CHECK: vpunpcklqdq +; CHECK: vpand ; CHECK: vmaxpd ; CHECK: ret define <2 x double> @ExeDepsFix_broadcastsd_inreg(<2 x double> %arg, <2 x double> %arg2, i64 %broadcastvalue) { diff --git a/test/CodeGen/X86/vec_splat-3.ll b/test/CodeGen/X86/vec_splat-3.ll index 754cbf41867..3ace250380d 100644 --- a/test/CodeGen/X86/vec_splat-3.ll +++ b/test/CodeGen/X86/vec_splat-3.ll @@ -75,9 +75,8 @@ define <16 x i8> @shuf_16i8_8(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { ret <16 x i8> %tmp6 ; CHECK-LABEL: shuf_16i8_8: -; CHECK: punpcklbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $0 +; CHECK: pxor %[[X:xmm[0-9]+]], %[[X]] +; CHECK-NEXT: pshufb %[[X]], %xmm0 } define <16 x i8> @shuf_16i8_9(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { @@ -85,9 +84,7 @@ define <16 x i8> @shuf_16i8_9(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { ret <16 x i8> %tmp6 ; CHECK-LABEL: shuf_16i8_9: -; CHECK: punpcklbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $85 +; CHECK: pshufb {{.*}} # xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] } define <16 x i8> @shuf_16i8_10(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { @@ -95,9 +92,7 @@ define <16 x i8> @shuf_16i8_10(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { ret <16 x i8> %tmp6 ; CHECK-LABEL: shuf_16i8_10: -; CHECK: punpcklbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $-86 +; CHECK: pshufb {{.*}} # xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] } define <16 x i8> @shuf_16i8_11(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { @@ -105,9 +100,7 @@ define <16 x i8> @shuf_16i8_11(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { ret <16 x i8> %tmp6 ; CHECK-LABEL: shuf_16i8_11: -; CHECK: punpcklbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $-1 +; CHECK: pshufb {{.*}} # xmm0 = xmm0[3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] } @@ -124,9 +117,7 @@ define <16 x i8> @shuf_16i8_13(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { ret <16 x i8> %tmp6 ; CHECK-LABEL: shuf_16i8_13: -; CHECK: punpcklbw -; CHECK-NEXT: punpckhbw -; CHECK-NEXT: pshufd $85 +; CHECK: pshufb {{.*}} # xmm0 = xmm0[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] } define <16 x i8> @shuf_16i8_14(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { @@ -134,9 +125,7 @@ define <16 x i8> @shuf_16i8_14(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { ret <16 x i8> %tmp6 ; CHECK-LABEL: shuf_16i8_14: -; CHECK: punpcklbw -; CHECK-NEXT: punpckhbw -; CHECK-NEXT: pshufd $-86 +; CHECK: pshufb {{.*}} # xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] } define <16 x i8> @shuf_16i8_15(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { @@ -144,9 +133,7 @@ define <16 x i8> @shuf_16i8_15(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { ret <16 x i8> %tmp6 ; CHECK-LABEL: shuf_16i8_15: -; CHECK: punpcklbw -; CHECK-NEXT: punpckhbw -; CHECK-NEXT: pshufd $-1 +; CHECK: pshufb {{.*}} # xmm0 = xmm0[7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] } define <16 x i8> @shuf_16i8_16(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { @@ -154,9 +141,7 @@ define <16 x i8> @shuf_16i8_16(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { ret <16 x i8> %tmp6 ; CHECK-LABEL: shuf_16i8_16: -; CHECK: punpckhbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $0 +; CHECK: pshufb {{.*}} # xmm0 = xmm0[8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] } define <16 x i8> @shuf_16i8_17(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { @@ -164,9 +149,7 @@ define <16 x i8> @shuf_16i8_17(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { ret <16 x i8> %tmp6 ; CHECK-LABEL: shuf_16i8_17: -; CHECK: punpckhbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $85 +; CHECK: pshufb {{.*}} # xmm0 = xmm0[9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] } define <16 x i8> @shuf_16i8_18(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { @@ -174,9 +157,7 @@ define <16 x i8> @shuf_16i8_18(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { ret <16 x i8> %tmp6 ; CHECK-LABEL: shuf_16i8_18: -; CHECK: punpckhbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $-86 +; CHECK: pshufb {{.*}} # xmm0 = xmm0[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] } define <16 x i8> @shuf_16i8_19(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { @@ -184,9 +165,7 @@ define <16 x i8> @shuf_16i8_19(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { ret <16 x i8> %tmp6 ; CHECK-LABEL: shuf_16i8_19: -; CHECK: punpckhbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $-1 +; CHECK: pshufb {{.*}} # xmm0 = xmm0[11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] } define <16 x i8> @shuf_16i8_20(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { @@ -194,9 +173,7 @@ define <16 x i8> @shuf_16i8_20(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { ret <16 x i8> %tmp6 ; CHECK-LABEL: shuf_16i8_20: -; CHECK: punpckhbw -; CHECK-NEXT: punpckhbw -; CHECK-NEXT: pshufd $0 +; CHECK: pshufb {{.*}} # xmm0 = xmm0[12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] } define <16 x i8> @shuf_16i8_21(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { @@ -204,9 +181,7 @@ define <16 x i8> @shuf_16i8_21(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { ret <16 x i8> %tmp6 ; CHECK-LABEL: shuf_16i8_21: -; CHECK: punpckhbw -; CHECK-NEXT: punpckhbw -; CHECK-NEXT: pshufd $85 +; CHECK: pshufb {{.*}} # xmm0 = xmm0[13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] } define <16 x i8> @shuf_16i8_22(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { @@ -214,9 +189,7 @@ define <16 x i8> @shuf_16i8_22(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { ret <16 x i8> %tmp6 ; CHECK-LABEL: shuf_16i8_22: -; CHECK: punpckhbw -; CHECK-NEXT: punpckhbw -; CHECK-NEXT: pshufd $-86 +; CHECK: pshufb {{.*}} # xmm0 = xmm0[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] } define <16 x i8> @shuf_16i8_23(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { @@ -224,7 +197,5 @@ define <16 x i8> @shuf_16i8_23(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { ret <16 x i8> %tmp6 ; CHECK-LABEL: shuf_16i8_23: -; CHECK: punpckhbw -; CHECK-NEXT: punpckhbw -; CHECK-NEXT: pshufd $-1 +; CHECK: pshufb {{.*}} # xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] }