From bdcb5afb77547337ba148ce24d5e1046c0b25ced Mon Sep 17 00:00:00 2001
From: Nate Begeman <natebegeman@mac.com>
Date: Tue, 27 Jul 2010 22:37:06 +0000
Subject: [PATCH] ~40% faster vector shl <4 x i32> on SSE 4.1  Larger
 improvements for smaller types coming in future patches.

For:

define <2 x i64> @shl(<4 x i32> %r, <4 x i32> %a) nounwind readnone ssp {
entry:
  %shl = shl <4 x i32> %r, %a                     ; <<4 x i32>> [#uses=1]
  %tmp2 = bitcast <4 x i32> %shl to <2 x i64>     ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %tmp2
}

We get:

_shl:                                   ## @shl
	pslld	$23, %xmm1
	paddd	LCPI0_0, %xmm1
	cvttps2dq	%xmm1, %xmm1
	pmulld	%xmm1, %xmm0
	ret

Instead of:

_shl:                                   ## @shl
	pshufd	$3, %xmm0, %xmm2
	movd	%xmm2, %eax
	pshufd	$3, %xmm1, %xmm2
	movd	%xmm2, %ecx
	shll	%cl, %eax
	movd	%eax, %xmm2
	pshufd	$1, %xmm0, %xmm3
	movd	%xmm3, %eax
	pshufd	$1, %xmm1, %xmm3
	movd	%xmm3, %ecx
	shll	%cl, %eax
	movd	%eax, %xmm3
	punpckldq	%xmm2, %xmm3
	movd	%xmm0, %eax
	movd	%xmm1, %ecx
	shll	%cl, %eax
	movd	%eax, %xmm2
	movhlps	%xmm0, %xmm0
	movd	%xmm0, %eax
	movhlps	%xmm1, %xmm1
	movd	%xmm1, %ecx
	shll	%cl, %eax
	movd	%eax, %xmm0
	punpckldq	%xmm0, %xmm2
	movdqa	%xmm2, %xmm0
	punpckldq	%xmm3, %xmm0
	ret


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@109549 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 33 ++++++++++++++++++++++++++++++
 lib/Target/X86/X86ISelLowering.h   |  1 +
 test/CodeGen/X86/vec_shift4.ll     | 14 +++++++++++++
 3 files changed, 48 insertions(+)
 create mode 100644 test/CodeGen/X86/vec_shift4.ll

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 618bb90d445..88bc8d0a92b 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -838,6 +838,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     // FIXME: Do we need to handle scalar-to-vector here?
     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
 
+    // Can turn SHL into an integer multiply.
+    setOperationAction(ISD::SHL,                MVT::v4i32, Custom);
+
     // i8 and i16 vectors are custom , because the source register and source
     // source memory operand types are not the same width.  f32 vectors are
     // custom since the immediate controlling the insert encodes additional
@@ -7498,6 +7501,35 @@ SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const {
   return Res;
 }
 
+SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue R = Op.getOperand(0);
+
+  assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later");
+  assert(VT == MVT::v4i32 && "Only know how to lower v4i32");
+  
+  Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                   DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
+                   Op.getOperand(1), DAG.getConstant(23, MVT::i32));
+
+  std::vector<Constant*> CV;
+  LLVMContext *Context = DAG.getContext();
+  CV.push_back(ConstantInt::get(*Context, APInt(32, 0x3f800000U)));
+  CV.push_back(ConstantInt::get(*Context, APInt(32, 0x3f800000U)));
+  CV.push_back(ConstantInt::get(*Context, APInt(32, 0x3f800000U)));
+  CV.push_back(ConstantInt::get(*Context, APInt(32, 0x3f800000U)));
+  Constant *C = ConstantVector::get(CV);
+  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+  SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+                               PseudoSourceValue::getConstantPool(), 0,
+                               false, false, 16);
+
+  Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend);
+  Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, Op);
+  Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
+  return DAG.getNode(ISD::MUL, dl, VT, Op, R);
+}
 
 SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
   // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
@@ -7730,6 +7762,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
   case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
   case ISD::MUL:                return LowerMUL_V2I64(Op, DAG);
+  case ISD::SHL:                return LowerSHL(Op, DAG);
   case ISD::SADDO:
   case ISD::UADDO:
   case ISD::SSUBO:
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 96c97d9f9ef..3556579ae65 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -723,6 +723,7 @@ namespace llvm {
     SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSHL(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const;
 
     SDValue LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
diff --git a/test/CodeGen/X86/vec_shift4.ll b/test/CodeGen/X86/vec_shift4.ll
new file mode 100644
index 00000000000..d8f4e4ec689
--- /dev/null
+++ b/test/CodeGen/X86/vec_shift4.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -march=x86 -mattr=+sse41 | FileCheck %s
+
+define <2 x i64> @shl(<4 x i32> %r, <4 x i32> %a) nounwind readnone ssp {
+entry:
+; CHECK-NOT: shll
+; CHECK: pslld
+; CHECK: paddd
+; CHECK: cvttps2dq
+; CHECK: pmulld
+
+  %shl = shl <4 x i32> %r, %a                     ; <<4 x i32>> [#uses=1]
+  %tmp2 = bitcast <4 x i32> %shl to <2 x i64>     ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %tmp2
+}