From 9320a1273e61a57ff25ddab7fdd48a29959e8ede Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 14 Sep 2016 14:08:18 +0000
Subject: [PATCH] [X86][SSE] Don't blend vector shifts with MOVSS/MOVSD
 directly, lower from generic shuffle

Shuffle lowering will correctly lower to MOVSS/MOVSD/PBLEND, improving
commutation opportunities

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@281471 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp  | 20 +++---
 test/CodeGen/X86/lower-vec-shift.ll | 92 +++++++++++++++++++----------
 2 files changed, 70 insertions(+), 42 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index dc7617d7b98..6e4c84c1938 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -20458,7 +20458,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     }

     // If possible, lower this shift as a sequence of two shifts by
-    // constant plus a MOVSS/MOVSD instead of scalarizing it.
+    // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
     // Example:
     //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
     //
@@ -20478,7 +20478,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
       SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
                                           Amt->getOperand(2);
       // See if it is possible to replace this node with a sequence of
-      // two shifts followed by a MOVSS/MOVSD
+      // two shifts followed by a MOVSS/MOVSD/PBLEND.
       if (VT == MVT::v4i32) {
         // Check if it is legal to use a MOVSS.
         CanBeSimplified = Amt2 == Amt->getOperand(2) &&
@@ -20510,21 +20510,21 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,

       if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
           isa<ConstantSDNode>(Amt2)) {
-        // Replace this node with two shifts followed by a MOVSS/MOVSD.
+        // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
         MVT CastVT = MVT::v4i32;
         SDValue Splat1 =
-          DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
+            DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
         SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
         SDValue Splat2 =
-          DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
+            DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
         SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
-        if (TargetOpcode == X86ISD::MOVSD)
-          CastVT = MVT::v2i64;
         SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
         SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
-        SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
-                                              BitCast1, DAG);
-        return DAG.getBitcast(VT, Result);
+        if (TargetOpcode == X86ISD::MOVSD)
+          return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
+                                                         BitCast2, {0, 1, 6, 7}));
+        return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
+                                                       BitCast2, {0, 5, 6, 7}));
       }
     }

diff --git a/test/CodeGen/X86/lower-vec-shift.ll b/test/CodeGen/X86/lower-vec-shift.ll
index edbd14a0176..783cda0a8dd 100644
--- a/test/CodeGen/X86/lower-vec-shift.ll
+++ b/test/CodeGen/X86/lower-vec-shift.ll
@@ -19,12 +19,19 @@ define <8 x i16> @test1(<8 x i16> %a) {
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test1:
-; AVX:       # BB#0:
-; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm1
-; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX-NEXT:    retq
+; AVX1-LABEL: test1:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test1:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrlw $2, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT:    retq
   %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
   ret <8 x i16> %lshr
 }
@@ -39,12 +46,19 @@ define <8 x i16> @test2(<8 x i16> %a) {
 ; SSE-NEXT:    movapd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test2:
-; AVX:       # BB#0:
-; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm1
-; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; AVX-NEXT:    retq
+; AVX1-LABEL: test2:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test2:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrlw $2, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT:    retq
   %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
   ret <8 x i16> %lshr
 }
@@ -61,9 +75,9 @@ define <4 x i32> @test3(<4 x i32> %a) {
 ;
 ; AVX1-LABEL: test3:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm1
-; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm0
-; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test3:
@@ -88,7 +102,7 @@ define <4 x i32> @test4(<4 x i32> %a) {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm1
 ; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm0
-; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test4:
@@ -109,12 +123,19 @@ define <8 x i16> @test5(<8 x i16> %a) {
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test5:
-; AVX:       # BB#0:
-; AVX-NEXT:    vpsraw $2, %xmm0, %xmm1
-; AVX-NEXT:    vpsraw $3, %xmm0, %xmm0
-; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX-NEXT:    retq
+; AVX1-LABEL: test5:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpsraw $3, %xmm0, %xmm1
+; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test5:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsraw $3, %xmm0, %xmm1
+; AVX2-NEXT:    vpsraw $2, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT:    retq
   %lshr = ashr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
   ret <8 x i16> %lshr
 }
@@ -129,12 +150,19 @@ define <8 x i16> @test6(<8 x i16> %a) {
 ; SSE-NEXT:    movapd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test6:
-; AVX:       # BB#0:
-; AVX-NEXT:    vpsraw $2, %xmm0, %xmm1
-; AVX-NEXT:    vpsraw $3, %xmm0, %xmm0
-; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; AVX-NEXT:    retq
+; AVX1-LABEL: test6:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
+; AVX1-NEXT:    vpsraw $3, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test6:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsraw $2, %xmm0, %xmm1
+; AVX2-NEXT:    vpsraw $3, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT:    retq
   %lshr = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
   ret <8 x i16> %lshr
 }
@@ -151,9 +179,9 @@ define <4 x i32> @test7(<4 x i32> %a) {
 ;
 ; AVX1-LABEL: test7:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm1
-; AVX1-NEXT:    vpsrad $3, %xmm0, %xmm0
-; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1-NEXT:    vpsrad $3, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test7:
@@ -178,7 +206,7 @@ define <4 x i32> @test8(<4 x i32> %a) {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm1
 ; AVX1-NEXT:    vpsrad $3, %xmm0, %xmm0
-; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test8:
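
Note on the two shuffle masks used above: both shift results are bitcast to v4i32 first, so mask {0, 5, 6, 7} keeps element 0 from the first shift and elements 1-3 from the second (the MOVSS pattern), while {0, 1, 6, 7} keeps elements 0-1 from the first and 2-3 from the second (the MOVSD pattern). Generic shuffle lowering is then free to emit MOVSS/MOVSD or a PBLENDW/PBLENDD, whichever commutes better, which is what the updated AVX1/AVX2 check lines reflect. As a rough illustration of what the new codegen corresponds to for test3, i.e. (v4i32 (srl A, <3, 2, 2, 2>)), here is a minimal SSE4.1 intrinsics sketch; the helper name is made up for this note and nothing below is taken from the patch itself:

    // Illustrative sketch only (assumes SSE4.1 is available); mirrors the
    // AVX1 CHECK lines for test3: two uniform immediate shifts blended together.
    #include <smmintrin.h>   // _mm_srli_epi32, _mm_blend_epi16

    __m128i srl_v4i32_by_3_2_2_2(__m128i a) {   // hypothetical helper name
      __m128i sh3 = _mm_srli_epi32(a, 3);       // shift all lanes by element 0's amount
      __m128i sh2 = _mm_srli_epi32(a, 2);       // shift all lanes by the splatted amount
      // Take 16-bit lanes 0-1 (i.e. dword 0) from sh3 and the rest from sh2;
      // this is the vpblendw the updated tests expect instead of a vmovss.
      return _mm_blend_epi16(sh2, sh3, 0x03);
    }

Under AVX2 the same blend may come out as a vpblendd rather than a vpblendw, as the new AVX2 check lines show.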