diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 98ae4db7ea2..6a962fd69e0 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10801,6 +10801,27 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
     return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), LHS.getValueType(), Ops);
   }
 
+  // Type legalization might introduce new shuffles in the DAG.
+  // Fold (VBinOp (shuffle (A, Undef, Mask)), (shuffle (B, Undef, Mask)))
+  //   -> (shuffle (VBinOp (A, B)), Undef, Mask).
+  if (LegalTypes && isa<ShuffleVectorSDNode>(LHS) &&
+      isa<ShuffleVectorSDNode>(RHS) && LHS.hasOneUse() && RHS.hasOneUse() &&
+      LHS.getOperand(1).getOpcode() == ISD::UNDEF &&
+      RHS.getOperand(1).getOpcode() == ISD::UNDEF) {
+    ShuffleVectorSDNode *SVN0 = cast<ShuffleVectorSDNode>(LHS);
+    ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(RHS);
+
+    if (SVN0->getMask().equals(SVN1->getMask())) {
+      EVT VT = N->getValueType(0);
+      SDValue UndefVector = LHS.getOperand(1);
+      SDValue NewBinOp = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
+                                     LHS.getOperand(0), RHS.getOperand(0));
+      AddUsersToWorkList(N);
+      return DAG.getVectorShuffle(VT, SDLoc(N), NewBinOp, UndefVector,
+                                  &SVN0->getMask()[0]);
+    }
+  }
+
   return SDValue();
 }
 
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index edc43a58223..9e6fd90cd16 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -17495,6 +17495,8 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget *Subtarget) {
   SDLoc dl(N);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
 
   // Don't create instructions with illegal types after legalize types has run.
@@ -17507,6 +17509,57 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
       N->getOpcode() == ISD::VECTOR_SHUFFLE)
     return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
 
+  // During Type Legalization, when promoting illegal vector types,
+  // the backend might introduce new shuffle dag nodes and bitcasts.
+  //
+  // This code performs the following transformation:
+  // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
+  //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
+  //
+  // We do this only if both the bitcast and the BINOP dag nodes have
+  // one use. Also, perform this transformation only if the new binary
+  // operation is legal. This is to avoid introducing dag nodes that
+  // potentially need to be further expanded (or custom lowered) into a
+  // less optimal sequence of dag nodes.
+  if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
+      N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
+      N0.getOpcode() == ISD::BITCAST) {
+    SDValue BC0 = N0.getOperand(0);
+    EVT SVT = BC0.getValueType();
+    unsigned Opcode = BC0.getOpcode();
+    unsigned NumElts = VT.getVectorNumElements();
+
+    if (BC0.hasOneUse() && SVT.isVector() &&
+        SVT.getVectorNumElements() * 2 == NumElts &&
+        TLI.isOperationLegal(Opcode, VT)) {
+      bool CanFold = false;
+      switch (Opcode) {
+      default : break;
+      case ISD::ADD :
+      case ISD::FADD :
+      case ISD::SUB :
+      case ISD::FSUB :
+      case ISD::MUL :
+      case ISD::FMUL :
+        CanFold = true;
+      }
+
+      unsigned SVTNumElts = SVT.getVectorNumElements();
+      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+      for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
+        CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
+      for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
+        CanFold = SVOp->getMaskElt(i) < 0;
+
+      if (CanFold) {
+        SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
+        SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
+        SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
+        return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
+      }
+    }
+  }
+
   // Only handle 128 wide vector from here on.
   if (!VT.is128BitVector())
     return SDValue();
 
diff --git a/test/CodeGen/X86/combine-64bit-vec-binop.ll b/test/CodeGen/X86/combine-64bit-vec-binop.ll
new file mode 100644
index 00000000000..8440fdab0ee
--- /dev/null
+++ b/test/CodeGen/X86/combine-64bit-vec-binop.ll
@@ -0,0 +1,273 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41
+; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
+
+
+define double @test1_add(double %A, double %B) {
+  %1 = bitcast double %A to <2 x i32>
+  %2 = bitcast double %B to <2 x i32>
+  %add = add <2 x i32> %1, %2
+  %3 = bitcast <2 x i32> %add to double
+  ret double %3
+}
+; CHECK-LABEL: test1_add
+; SSE41: paddd
+; AVX: vpaddd
+; CHECK-NEXT: ret
+
+
+define double @test2_add(double %A, double %B) {
+  %1 = bitcast double %A to <4 x i16>
+  %2 = bitcast double %B to <4 x i16>
+  %add = add <4 x i16> %1, %2
+  %3 = bitcast <4 x i16> %add to double
+  ret double %3
+}
+; CHECK-LABEL: test2_add
+; SSE41: paddw
+; AVX: vpaddw
+; CHECK-NEXT: ret
+
+define double @test3_add(double %A, double %B) {
+  %1 = bitcast double %A to <8 x i8>
+  %2 = bitcast double %B to <8 x i8>
+  %add = add <8 x i8> %1, %2
+  %3 = bitcast <8 x i8> %add to double
+  ret double %3
+}
+; CHECK-LABEL: test3_add
+; SSE41: paddb
+; AVX: vpaddb
+; CHECK-NEXT: ret
+
+
+define double @test1_sub(double %A, double %B) {
+  %1 = bitcast double %A to <2 x i32>
+  %2 = bitcast double %B to <2 x i32>
+  %sub = sub <2 x i32> %1, %2
+  %3 = bitcast <2 x i32> %sub to double
+  ret double %3
+}
+; CHECK-LABEL: test1_sub
+; SSE41: psubd
+; AVX: vpsubd
+; CHECK-NEXT: ret
+
+
+define double @test2_sub(double %A, double %B) {
+  %1 = bitcast double %A to <4 x i16>
+  %2 = bitcast double %B to <4 x i16>
+  %sub = sub <4 x i16> %1, %2
+  %3 = bitcast <4 x i16> %sub to double
+  ret double %3
+}
+; CHECK-LABEL: test2_sub
+; SSE41: psubw
+; AVX: vpsubw
+; CHECK-NEXT: ret
+
+
+define double @test3_sub(double %A, double %B) {
+  %1 = bitcast double %A to <8 x i8>
+  %2 = bitcast double %B to <8 x i8>
+  %sub = sub <8 x i8> %1, %2
+  %3 = bitcast <8 x i8> %sub to double
+  ret double %3
+}
+; CHECK-LABEL: test3_sub
+; SSE41: psubb
+; AVX: vpsubb
+; CHECK-NEXT: ret
+
+
+define double @test1_mul(double %A, double %B) {
+  %1 = bitcast double %A to <2 x i32>
+  %2 = bitcast double %B to <2 x i32>
+  %mul = mul <2 x i32> %1, %2
+  %3 = bitcast <2 x i32> %mul to double
+  ret double %3
+}
+; CHECK-LABEL: test1_mul
+; SSE41: pmulld
+; AVX: vpmulld
+; CHECK-NEXT: ret
+
+
+define double @test2_mul(double %A, double %B) {
+  %1 = bitcast double %A to <4 x i16>
+  %2 = bitcast double %B to <4 x i16>
+  %mul = mul <4 x i16> %1, %2
+  %3 = bitcast <4 x i16> %mul to double
+  ret double %3
+}
+; CHECK-LABEL: test2_mul
+; SSE41: pmullw
+; AVX: vpmullw
+; CHECK-NEXT: ret
+
+; There is no legal ISD::MUL with type MVT::v16i8.
+define double @test3_mul(double %A, double %B) {
+  %1 = bitcast double %A to <8 x i8>
+  %2 = bitcast double %B to <8 x i8>
+  %mul = mul <8 x i8> %1, %2
+  %3 = bitcast <8 x i8> %mul to double
+  ret double %3
+}
+; CHECK-LABEL: test3_mul
+; CHECK: pmullw
+; CHECK-NEXT: pshufb
+; CHECK-NEXT: ret
+
+
+define double @test1_and(double %A, double %B) {
+  %1 = bitcast double %A to <2 x i32>
+  %2 = bitcast double %B to <2 x i32>
+  %and = and <2 x i32> %1, %2
+  %3 = bitcast <2 x i32> %and to double
+  ret double %3
+}
+; CHECK-LABEL: test1_and
+; SSE41: andps
+; AVX: vandps
+; CHECK-NEXT: ret
+
+
+define double @test2_and(double %A, double %B) {
+  %1 = bitcast double %A to <4 x i16>
+  %2 = bitcast double %B to <4 x i16>
+  %and = and <4 x i16> %1, %2
+  %3 = bitcast <4 x i16> %and to double
+  ret double %3
+}
+; CHECK-LABEL: test2_and
+; SSE41: andps
+; AVX: vandps
+; CHECK-NEXT: ret
+
+
+define double @test3_and(double %A, double %B) {
+  %1 = bitcast double %A to <8 x i8>
+  %2 = bitcast double %B to <8 x i8>
+  %and = and <8 x i8> %1, %2
+  %3 = bitcast <8 x i8> %and to double
+  ret double %3
+}
+; CHECK-LABEL: test3_and
+; SSE41: andps
+; AVX: vandps
+; CHECK-NEXT: ret
+
+
+define double @test1_or(double %A, double %B) {
+  %1 = bitcast double %A to <2 x i32>
+  %2 = bitcast double %B to <2 x i32>
+  %or = or <2 x i32> %1, %2
+  %3 = bitcast <2 x i32> %or to double
+  ret double %3
+}
+; CHECK-LABEL: test1_or
+; SSE41: orps
+; AVX: vorps
+; CHECK-NEXT: ret
+
+
+define double @test2_or(double %A, double %B) {
+  %1 = bitcast double %A to <4 x i16>
+  %2 = bitcast double %B to <4 x i16>
+  %or = or <4 x i16> %1, %2
+  %3 = bitcast <4 x i16> %or to double
+  ret double %3
+}
+; CHECK-LABEL: test2_or
+; SSE41: orps
+; AVX: vorps
+; CHECK-NEXT: ret
+
+
+define double @test3_or(double %A, double %B) {
+  %1 = bitcast double %A to <8 x i8>
+  %2 = bitcast double %B to <8 x i8>
+  %or = or <8 x i8> %1, %2
+  %3 = bitcast <8 x i8> %or to double
+  ret double %3
+}
+; CHECK-LABEL: test3_or
+; SSE41: orps
+; AVX: vorps
+; CHECK-NEXT: ret
+
+
+define double @test1_xor(double %A, double %B) {
+  %1 = bitcast double %A to <2 x i32>
+  %2 = bitcast double %B to <2 x i32>
+  %xor = xor <2 x i32> %1, %2
+  %3 = bitcast <2 x i32> %xor to double
+  ret double %3
+}
+; CHECK-LABEL: test1_xor
+; SSE41: xorps
+; AVX: vxorps
+; CHECK-NEXT: ret
+
+
+define double @test2_xor(double %A, double %B) {
+  %1 = bitcast double %A to <4 x i16>
+  %2 = bitcast double %B to <4 x i16>
+  %xor = xor <4 x i16> %1, %2
+  %3 = bitcast <4 x i16> %xor to double
+  ret double %3
+}
+; CHECK-LABEL: test2_xor
+; SSE41: xorps
+; AVX: vxorps
+; CHECK-NEXT: ret
+
+
+define double @test3_xor(double %A, double %B) {
+  %1 = bitcast double %A to <8 x i8>
+  %2 = bitcast double %B to <8 x i8>
+  %xor = xor <8 x i8> %1, %2
+  %3 = bitcast <8 x i8> %xor to double
+  ret double %3
+}
+; CHECK-LABEL: test3_xor
+; SSE41: xorps
+; AVX: vxorps
+; CHECK-NEXT: ret
+
+
+define double @test_fadd(double %A, double %B) {
+  %1 = bitcast double %A to <2 x float>
+  %2 = bitcast double %B to <2 x float>
+  %add = fadd <2 x float> %1, %2
+  %3 = bitcast <2 x float> %add to double
+  ret double %3
+}
+; CHECK-LABEL: test_fadd
+; SSE41: addps
+; AVX: vaddps
+; CHECK-NEXT: ret
+
+define double @test_fsub(double %A, double %B) {
+  %1 = bitcast double %A to <2 x float>
+  %2 = bitcast double %B to <2 x float>
+  %sub = fsub <2 x float> %1, %2
+  %3 = bitcast <2 x float> %sub to double
+  ret double %3
+}
+; CHECK-LABEL: test_fsub
+; SSE41: subps
+; AVX: vsubps
+; CHECK-NEXT: ret
+
+define double @test_fmul(double %A, double %B) {
+  %1 = bitcast double %A to <2 x float>
+  %2 = bitcast double %B to <2 x float>
+  %mul = fmul <2 x float> %1, %2
+  %3 = bitcast <2 x float> %mul to double
+  ret double %3
+}
+; CHECK-LABEL: test_fmul
+; SSE41: mulps
+; AVX: vmulps
+; CHECK-NEXT: ret
+
diff --git a/test/CodeGen/X86/lower-bitcast.ll b/test/CodeGen/X86/lower-bitcast.ll
index b9b29a558e2..769831ee818 100644
--- a/test/CodeGen/X86/lower-bitcast.ll
+++ b/test/CodeGen/X86/lower-bitcast.ll
@@ -14,7 +14,7 @@ define double @test1(double %A) {
 ; CHECK-LABEL: test1
 ; CHECK-NOT: movsd
 ; CHECK: pshufd
-; CHECK-NEXT: paddq
+; CHECK-NEXT: paddd
 ; CHECK-NEXT: pshufd
 ; CHECK-NEXT: ret
 
@@ -26,16 +26,9 @@ define double @test2(double %A, double %B) {
   %3 = bitcast <2 x i32> %add to double
   ret double %3
 }
-; FIXME: Ideally we should be able to fold the entire body of @test2 into a
-; single 'paddd %xmm1, %xmm0' instruction. At the moment we produce the
-; sequence pshufd+pshufd+paddq+pshufd.
-
 ; CHECK-LABEL: test2
 ; CHECK-NOT: movsd
-; CHECK: pshufd
-; CHECK-NEXT: pshufd
-; CHECK-NEXT: paddq
-; CHECK-NEXT: pshufd
+; CHECK: paddd
 ; CHECK-NEXT: ret
 
 
@@ -91,7 +84,7 @@ define double @test6(double %A) {
 ; CHECK-LABEL: test6
 ; CHECK-NOT: movsd
 ; CHECK: punpcklwd
-; CHECK-NEXT: paddd
+; CHECK-NEXT: paddw
 ; CHECK-NEXT: pshufb
 ; CHECK-NEXT: ret
 
@@ -103,16 +96,10 @@ define double @test7(double %A, double %B) {
   %3 = bitcast <4 x i16> %add to double
   ret double %3
 }
-; FIXME: Ideally we should be able to fold the entire body of @test7 into a
-; single 'paddw %xmm1, %xmm0' instruction. At the moment we produce the
-; sequence pshufd+pshufd+paddd+pshufd.
-
 ; CHECK-LABEL: test7
 ; CHECK-NOT: movsd
-; CHECK: punpcklwd
-; CHECK-NEXT: punpcklwd
-; CHECK-NEXT: paddd
-; CHECK-NEXT: pshufb
+; CHECK-NOT: punpcklwd
+; CHECK: paddw
 ; CHECK-NEXT: ret
 
 
@@ -129,7 +116,7 @@ define double @test8(double %A) {
 ; CHECK-LABEL: test8
 ; CHECK-NOT: movsd
 ; CHECK: punpcklbw
-; CHECK-NEXT: paddw
+; CHECK-NEXT: paddb
 ; CHECK-NEXT: pshufb
 ; CHECK-NEXT: ret
 
@@ -141,15 +128,9 @@ define double @test9(double %A, double %B) {
   %3 = bitcast <8 x i8> %add to double
   ret double %3
 }
-; FIXME: Ideally we should be able to fold the entire body of @test9 into a
-; single 'paddb %xmm1, %xmm0' instruction. At the moment we produce the
-; sequence pshufd+pshufd+paddw+pshufd.
-
 ; CHECK-LABEL: test9
 ; CHECK-NOT: movsd
-; CHECK: punpcklbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: paddw
-; CHECK-NEXT: pshufb
+; CHECK-NOT: punpcklbw
+; CHECK: paddb
 ; CHECK-NEXT: ret
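
Note (illustrative, not part of the patch): the new SimplifyVBinOp fold is easiest to see on a reduced, already-legal example. The IR below is a sketch; the function name and the shuffle mask are invented for this note, but the code matches the documented pattern, since both add operands are one-use shuffles of distinct vectors with the same mask and an undef second vector. After the combine, the add is expected to be performed on %a and %b directly, with a single shuffle applied to the result.

; Illustrative only; can be fed to llc in the same way as the tests above.
define <4 x i32> @vbinop_of_shuffles(<4 x i32> %a, <4 x i32> %b) {
  ; Both shuffles use the same mask and an undef second operand, and each
  ; has a single use (the add below).
  %sa = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %sb = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ; Candidate for the fold:
  ;   (add (shuffle A, undef, M), (shuffle B, undef, M))
  ;     -> (shuffle (add A, B), undef, M)
  %r = add <4 x i32> %sa, %sb
  ret <4 x i32> %r
}

The X86-specific combine in PerformShuffleCombine covers the complementary shape produced when type legalization promotes an illegal vector type, where the shuffle sits above a bitcast of the promoted binary operation, as exercised by the <2 x i32>, <4 x i16> and <8 x i8> tests in combine-64bit-vec-binop.ll.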