From 87d1836793831974f4dc1b1e030b37d43beba223 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 21 May 2015 10:05:03 +0000 Subject: [PATCH] [X86][SSE] Improve support for 128-bit vector sign extension This patch improves support for sign extension of the lower lanes of vectors of integers by making use of the SSE41 pmovsx* sign extension instructions where possible, and optimizing the sign extension by shifts on pre-SSE41 targets (avoiding the use of i64 arithmetic shifts which require scalarization). It converts SIGN_EXTEND nodes to SIGN_EXTEND_VECTOR_INREG where necessary, that more closely matches the pmovsx* instruction than the default approach of using SIGN_EXTEND_INREG which splits the operation (into an ANY_EXTEND lowered to a shuffle followed by shifts) making instruction matching difficult during lowering. Necessary support for SIGN_EXTEND_VECTOR_INREG has been added to the DAGCombiner. Differential Revision: http://reviews.llvm.org/D9848 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@237885 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 23 +- lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 31 +- lib/Target/X86/X86ISelLowering.cpp | 121 ++++++- test/CodeGen/X86/pr15267.ll | 14 +- test/CodeGen/X86/vec_cast2.ll | 14 +- test/CodeGen/X86/vector-sext.ll | 376 +++++++--------------- test/CodeGen/X86/vselect-avx.ll | 19 +- 7 files changed, 298 insertions(+), 300 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 3ff4c4062f1..77e648c152c 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -268,6 +268,7 @@ namespace { SDValue visitZERO_EXTEND(SDNode *N); SDValue visitANY_EXTEND(SDNode *N); SDValue visitSIGN_EXTEND_INREG(SDNode *N); + SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N); SDValue visitTRUNCATE(SDNode *N); SDValue visitBITCAST(SDNode *N); SDValue visitBUILD_PAIR(SDNode *N); @@ -1347,6 +1348,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N); case ISD::ANY_EXTEND: return visitANY_EXTEND(N); case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N); + case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N); case ISD::TRUNCATE: return visitTRUNCATE(N); case ISD::BITCAST: return visitBITCAST(N); case ISD::BUILD_PAIR: return visitBUILD_PAIR(N); @@ -5541,7 +5543,8 @@ static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI, EVT VT = N->getValueType(0); assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND || - Opcode == ISD::ANY_EXTEND) && "Expected EXTEND dag node in input!"); + Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG) + && "Expected EXTEND dag node in input!"); // fold (sext c1) -> c1 // fold (zext c1) -> c1 @@ -5563,7 +5566,7 @@ static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI, unsigned EVTBits = N0->getValueType(0).getScalarType().getSizeInBits(); unsigned ShAmt = VTBits - EVTBits; SmallVector Elts; - unsigned NumElts = N0->getNumOperands(); + unsigned NumElts = VT.getVectorNumElements(); SDLoc DL(N); for (unsigned i=0; i != NumElts; ++i) { @@ -5576,7 +5579,7 @@ static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI, SDLoc DL(Op); ConstantSDNode *CurrentND = cast(Op); const APInt &C = APInt(VTBits, CurrentND->getAPIntValue().getZExtValue()); - if (Opcode == ISD::SIGN_EXTEND) + if (Opcode == ISD::SIGN_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG) Elts.push_back(DAG.getConstant(C.shl(ShAmt).ashr(ShAmt).getZExtValue(), DL, SVT)); else @@ -6805,6 +6808,20 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitSIGN_EXTEND_VECTOR_INREG(SDNode *N) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + if (N0.getOpcode() == ISD::UNDEF) + return DAG.getUNDEF(VT); + + if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, + LegalOperations)) + return SDValue(Res, 0); + + return SDValue(); +} + SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 6d75a7c9533..eaba9ca14eb 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -3429,12 +3429,35 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, assert(EVT.bitsLE(VT) && "Not extending!"); if (EVT == VT) return N1; // Not actually extending + auto SignExtendInReg = [&](APInt Val) { + unsigned FromBits = EVT.getScalarType().getSizeInBits(); + Val <<= Val.getBitWidth() - FromBits; + Val = Val.ashr(Val.getBitWidth() - FromBits); + return getConstant(Val, DL, VT.getScalarType()); + }; + if (N1C) { APInt Val = N1C->getAPIntValue(); - unsigned FromBits = EVT.getScalarType().getSizeInBits(); - Val <<= Val.getBitWidth()-FromBits; - Val = Val.ashr(Val.getBitWidth()-FromBits); - return getConstant(Val, DL, VT); + return SignExtendInReg(Val); + } + if (ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) { + SmallVector Ops; + for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) { + SDValue Op = N1.getOperand(i); + if (Op.getValueType() != VT.getScalarType()) break; + if (Op.getOpcode() == ISD::UNDEF) { + Ops.push_back(Op); + continue; + } + if (ConstantSDNode *C = dyn_cast(Op.getNode())) { + APInt Val = C->getAPIntValue(); + Ops.push_back(SignExtendInReg(Val)); + continue; + } + break; + } + if (Ops.size() == VT.getVectorNumElements()) + return getNode(ISD::BUILD_VECTOR, DL, VT, Ops); } break; } diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index de28fdbb229..35d7c80c0db 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1004,6 +1004,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } if (Subtarget->hasSSE2()) { + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); + setOperationAction(ISD::SRL, MVT::v8i16, Custom); setOperationAction(ISD::SRL, MVT::v16i8, Custom); @@ -13914,6 +13918,63 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget return DAG.getNode(X86ISD::VTRUNC, dl, VT, V); } +static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDValue In = Op->getOperand(0); + MVT VT = Op->getSimpleValueType(0); + MVT InVT = In.getSimpleValueType(); + assert(VT.getSizeInBits() == InVT.getSizeInBits()); + + MVT SVT = VT.getScalarType(); + MVT InSVT = InVT.getScalarType(); + assert(SVT.getScalarSizeInBits() > InSVT.getScalarSizeInBits()); + + if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) + return SDValue(); + if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) + return SDValue(); + + SDLoc dl(Op); + + // SSE41 targets can use the pmovsx* instructions directly. + if (Subtarget->hasSSE41()) + return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + + // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI. + SDValue Curr = In; + MVT CurrVT = InVT; + + // As SRAI is only available on i16/i32 types, we expand only up to i32 + // and handle i64 separately. + while (CurrVT != VT && CurrVT.getScalarType() != MVT::i32) { + Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr); + MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2); + CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2); + Curr = DAG.getNode(ISD::BITCAST, dl, CurrVT, Curr); + } + + SDValue SignExt = Curr; + if (CurrVT != InVT) { + unsigned SignExtShift = + CurrVT.getScalarSizeInBits() - InSVT.getScalarSizeInBits(); + SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, + DAG.getConstant(SignExtShift, dl, MVT::i8)); + } + + if (CurrVT == VT) + return SignExt; + + if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) { + SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, + DAG.getConstant(31, dl, MVT::i8)); + SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5}); + return DAG.getNode(ISD::BITCAST, dl, VT, Ext); + } + + return SDValue(); +} + static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); @@ -17580,6 +17641,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG); case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG); + case ISD::SIGN_EXTEND_VECTOR_INREG: + return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); @@ -23683,16 +23746,19 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); - SDLoc dl(N); + EVT SVT = VT.getScalarType(); + EVT InVT = N0->getValueType(0); + EVT InSVT = InVT.getScalarType(); + SDLoc DL(N); // (i8,i32 sext (sdivrem (i8 x, i8 y)) -> // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y) // This exposes the sext to the sdivrem lowering, so that it directly extends // from AH (which we otherwise need to do contortions to access). if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 && - N0.getValueType() == MVT::i8 && VT == MVT::i32) { + InVT == MVT::i8 && VT == MVT::i32) { SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); - SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys, + SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, DL, NodeTys, N0.getOperand(0), N0.getOperand(1)); DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); return R.getValue(1); @@ -23700,14 +23766,57 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, if (!DCI.isBeforeLegalizeOps()) { if (N0.getValueType() == MVT::i1) { - SDValue Zero = DAG.getConstant(0, dl, VT); + SDValue Zero = DAG.getConstant(0, DL, VT); SDValue AllOnes = - DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, VT); - return DAG.getNode(ISD::SELECT, dl, VT, N0, AllOnes, Zero); + DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT); + return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero); } return SDValue(); } + if (VT.isVector()) { + auto ExtendToVec128 = [&DAG](SDLoc DL, SDValue N) { + EVT InVT = N->getValueType(0); + EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(), + 128 / InVT.getScalarSizeInBits()); + SmallVector Opnds(128 / InVT.getSizeInBits(), + DAG.getUNDEF(InVT)); + Opnds[0] = N; + return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds); + }; + + // If target-size is 128-bits, then convert to ISD::SIGN_EXTEND_VECTOR_INREG + // which ensures lowering to X86ISD::VSEXT (pmovsx*). + if (VT.getSizeInBits() == 128 && + (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) && + (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) { + SDValue ExOp = ExtendToVec128(DL, N0); + return DAG.getSignExtendVectorInReg(ExOp, DL, VT); + } + + // On pre-AVX2 targets, split into 128-bit nodes of + // ISD::SIGN_EXTEND_VECTOR_INREG. + if (!Subtarget->hasInt256() && !(VT.getSizeInBits() % 128) && + (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) && + (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) { + unsigned NumVecs = VT.getSizeInBits() / 128; + unsigned NumSubElts = 128 / SVT.getSizeInBits(); + EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); + EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); + + SmallVector Opnds; + for (unsigned i = 0, Offset = 0; i != NumVecs; + ++i, Offset += NumSubElts) { + SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, + DAG.getIntPtrConstant(Offset, DL)); + SrcVec = ExtendToVec128(DL, SrcVec); + SrcVec = DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT); + Opnds.push_back(SrcVec); + } + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); + } + } + if (!Subtarget->hasFp256()) return SDValue(); diff --git a/test/CodeGen/X86/pr15267.ll b/test/CodeGen/X86/pr15267.ll index e9f41d9ded4..95d7deb3417 100644 --- a/test/CodeGen/X86/pr15267.ll +++ b/test/CodeGen/X86/pr15267.ll @@ -50,20 +50,22 @@ define <4 x i64> @test3(<4 x i1>* %in) nounwind { ; CHECK: movq ; CHECK: shlq ; CHECK: sarq -; CHECK: vmovq ; CHECK: movq ; CHECK: shlq ; CHECK: sarq -; CHECK: vmovq -; CHECK: vpunpcklqdq +; CHECK: vmovd +; CHECK: vpinsrd ; CHECK: movq ; CHECK: shlq ; CHECK: sarq -; CHECK: vmovq +; CHECK: vpinsrd ; CHECK: shlq ; CHECK: sarq -; CHECK: vmovq -; CHECK: vpunpcklqdq +; CHECK: vpinsrd +; CHECK: vpmovsxdq +; CHECK: vmovd +; CHECK: vpinsrd +; CHECK: vpmovsxdq ; CHECK: vinsertf128 ; CHECK: ret diff --git a/test/CodeGen/X86/vec_cast2.ll b/test/CodeGen/X86/vec_cast2.ll index e50789570ed..1ba11f51baa 100644 --- a/test/CodeGen/X86/vec_cast2.ll +++ b/test/CodeGen/X86/vec_cast2.ll @@ -16,13 +16,9 @@ define <8 x float> @foo1_8(<8 x i8> %src) { ; ; CHECK-WIDE-LABEL: foo1_8: ; CHECK-WIDE: ## BB#0: -; CHECK-WIDE-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; CHECK-WIDE-NEXT: vpslld $24, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpsrad $24, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-WIDE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; CHECK-WIDE-NEXT: vpslld $24, %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vpsrad $24, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpmovsxbd %xmm0, %xmm1 +; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-WIDE-NEXT: vpmovsxbd %xmm0, %xmm0 ; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0 ; CHECK-WIDE-NEXT: retl @@ -40,9 +36,7 @@ define <4 x float> @foo1_4(<4 x i8> %src) { ; ; CHECK-WIDE-LABEL: foo1_4: ; CHECK-WIDE: ## BB#0: -; CHECK-WIDE-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; CHECK-WIDE-NEXT: vpslld $24, %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vpsrad $24, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpmovsxbd %xmm0, %xmm0 ; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-WIDE-NEXT: retl %res = sitofp <4 x i8> %src to <4 x float> diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll index c4273710a91..e6acc7efaf3 100644 --- a/test/CodeGen/X86/vector-sext.ll +++ b/test/CodeGen/X86/vector-sext.ll @@ -10,37 +10,30 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp { ; SSE2-LABEL: sext_8i16_to_8i32: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: # kill: XMM0 XMM1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: sext_8i16_to_8i32: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: # kill: XMM0 XMM1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $16, %xmm0 -; SSSE3-NEXT: psrad $16, %xmm0 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $16, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: psrad $16, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: sext_8i16_to_8i32: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pmovzxwd %xmm1, %xmm0 -; SSE41-NEXT: pslld $16, %xmm0 -; SSE41-NEXT: psrad $16, %xmm0 -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE41-NEXT: pslld $16, %xmm1 -; SSE41-NEXT: psrad $16, %xmm1 +; SSE41-NEXT: pmovsxwd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxwd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: sext_8i16_to_8i32: @@ -58,13 +51,10 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp ; ; X32-SSE41-LABEL: sext_8i16_to_8i32: ; X32-SSE41: # BB#0: # %entry -; X32-SSE41-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE41-NEXT: pmovzxwd %xmm1, %xmm0 -; X32-SSE41-NEXT: pslld $16, %xmm0 -; X32-SSE41-NEXT: psrad $16, %xmm0 -; X32-SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; X32-SSE41-NEXT: pslld $16, %xmm1 -; X32-SSE41-NEXT: psrad $16, %xmm1 +; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 ; X32-SSE41-NEXT: retl entry: %B = sext <8 x i16> %A to <8 x i32> @@ -74,68 +64,31 @@ entry: define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp { ; SSE2-LABEL: sext_4i32_to_4i64: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] -; SSE2-NEXT: movd %xmm1, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm1, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: sext_4i32_to_4i64: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] -; SSSE3-NEXT: movd %xmm1, %rax -; SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSSE3-NEXT: movd %xmm1, %rax -; SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm1 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSSE3-NEXT: movd %xmm0, %rax -; SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSSE3-NEXT: movd %xmm0, %rax -; SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: sext_4i32_to_4i64: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: pmovzxdq %xmm0, %xmm1 -; SSE41-NEXT: pextrq $1, %xmm1, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm3 -; SSE41-NEXT: movd %xmm1, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm2 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm3 -; SSE41-NEXT: movd %xmm0, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm1 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE41-NEXT: pmovsxdq %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxdq %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -154,20 +107,9 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp ; ; X32-SSE41-LABEL: sext_4i32_to_4i64: ; X32-SSE41: # BB#0: # %entry -; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2 -; X32-SSE41-NEXT: movd %xmm2, %eax -; X32-SSE41-NEXT: sarl $31, %eax -; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx -; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2 -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] -; X32-SSE41-NEXT: movd %xmm1, %eax -; X32-SSE41-NEXT: sarl $31, %eax -; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx -; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1 -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1 +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1 ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 ; X32-SSE41-NEXT: retl entry: @@ -252,20 +194,26 @@ entry: define <2 x i64> @load_sext_test3(<2 x i8> *%ptr) { ; SSE2-LABEL: load_sext_test3: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movsbq 1(%rdi), %rax -; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: movsbq (%rdi), %rax -; SSE2-NEXT: movd %rax, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movzwl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_test3: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: movsbq 1(%rdi), %rax -; SSSE3-NEXT: movd %rax, %xmm1 -; SSSE3-NEXT: movsbq (%rdi), %rax -; SSSE3-NEXT: movd %rax, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: movzwl (%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: psrad $24, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: load_sext_test3: @@ -292,20 +240,22 @@ entry: define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) { ; SSE2-LABEL: load_sext_test4: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movswq 2(%rdi), %rax -; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: movswq (%rdi), %rax -; SSE2-NEXT: movd %rax, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movd (%rdi), %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_test4: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: movswq 2(%rdi), %rax -; SSSE3-NEXT: movd %rax, %xmm1 -; SSSE3-NEXT: movswq (%rdi), %rax -; SSSE3-NEXT: movd %rax, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: movd (%rdi), %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: load_sext_test4: @@ -332,20 +282,18 @@ entry: define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) { ; SSE2-LABEL: load_sext_test5: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movslq 4(%rdi), %rax -; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: movslq (%rdi), %rax -; SSE2-NEXT: movd %rax, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movq (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_test5: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: movslq 4(%rdi), %rax -; SSSE3-NEXT: movd %rax, %xmm1 -; SSSE3-NEXT: movslq (%rdi), %rax -; SSSE3-NEXT: movd %rax, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: movq (%rdi), %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: load_sext_test5: @@ -410,72 +358,35 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; SSE2: # BB#0: ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] -; SSE2-NEXT: movd %xmm1, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm1, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: sext_4i1_to_4i64: ; SSSE3: # BB#0: ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] -; SSSE3-NEXT: movd %xmm1, %rax -; SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSSE3-NEXT: movd %xmm1, %rax -; SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm1 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSSE3-NEXT: movd %xmm0, %rax -; SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSSE3-NEXT: movd %xmm0, %rax -; SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: sext_4i1_to_4i64: ; SSE41: # BB#0: ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: pmovzxdq %xmm0, %xmm1 -; SSE41-NEXT: pextrq $1, %xmm1, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm3 -; SSE41-NEXT: movd %xmm1, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm2 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm3 -; SSE41-NEXT: movd %xmm0, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm1 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE41-NEXT: pmovsxdq %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxdq %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -500,20 +411,9 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; X32-SSE41: # BB#0: ; X32-SSE41-NEXT: pslld $31, %xmm0 ; X32-SSE41-NEXT: psrad $31, %xmm0 -; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2 -; X32-SSE41-NEXT: movd %xmm2, %eax -; X32-SSE41-NEXT: sarl $31, %eax -; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx -; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2 -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] -; X32-SSE41-NEXT: movd %xmm1, %eax -; X32-SSE41-NEXT: sarl $31, %eax -; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx -; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1 -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1 +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1 ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 ; X32-SSE41-NEXT: retl %extmask = sext <4 x i1> %mask to <4 x i64> @@ -576,72 +476,35 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { ; SSE2: # BB#0: ; SSE2-NEXT: pslld $24, %xmm0 ; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] -; SSE2-NEXT: movd %xmm1, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm1, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: cltq -; SSE2-NEXT: movd %rax, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: sext_4i8_to_4i64: ; SSSE3: # BB#0: ; SSSE3-NEXT: pslld $24, %xmm0 ; SSSE3-NEXT: psrad $24, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] -; SSSE3-NEXT: movd %xmm1, %rax -; SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSSE3-NEXT: movd %xmm1, %rax -; SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm1 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSSE3-NEXT: movd %xmm0, %rax -; SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSSE3-NEXT: movd %xmm0, %rax -; SSSE3-NEXT: cltq -; SSSE3-NEXT: movd %rax, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: sext_4i8_to_4i64: ; SSE41: # BB#0: ; SSE41-NEXT: pslld $24, %xmm0 ; SSE41-NEXT: psrad $24, %xmm0 -; SSE41-NEXT: pmovzxdq %xmm0, %xmm1 -; SSE41-NEXT: pextrq $1, %xmm1, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm3 -; SSE41-NEXT: movd %xmm1, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm2 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm3 -; SSE41-NEXT: movd %xmm0, %rax -; SSE41-NEXT: cltq -; SSE41-NEXT: movd %rax, %xmm1 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE41-NEXT: pmovsxdq %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxdq %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -666,20 +529,9 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { ; X32-SSE41: # BB#0: ; X32-SSE41-NEXT: pslld $24, %xmm0 ; X32-SSE41-NEXT: psrad $24, %xmm0 -; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2 -; X32-SSE41-NEXT: movd %xmm2, %eax -; X32-SSE41-NEXT: sarl $31, %eax -; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx -; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2 -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] -; X32-SSE41-NEXT: movd %xmm1, %eax -; X32-SSE41-NEXT: sarl $31, %eax -; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx -; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1 -; X32-SSE41-NEXT: sarl $31, %ecx -; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1 +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1 ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 ; X32-SSE41-NEXT: retl %extmask = sext <4 x i8> %mask to <4 x i64> diff --git a/test/CodeGen/X86/vselect-avx.ll b/test/CodeGen/X86/vselect-avx.ll index ff26aebaac6..de04a097de0 100644 --- a/test/CodeGen/X86/vselect-avx.ll +++ b/test/CodeGen/X86/vselect-avx.ll @@ -14,8 +14,8 @@ target triple = "x86_64-apple-macosx" ; ; CHECK-LABEL: test: -; CHECK: vmovdqa {{.*#+}} xmm0 = [65535,0,0,65535] -; CHECK: vmovdqa {{.*#+}} xmm2 = [65533,124,125,14807] +; CHECK: vmovdqa {{.*#+}} xmm1 = [65533,124,125,14807] +; CHECK: vmovdqa {{.*#+}} xmm1 = [65535,0,0,65535] ; CHECK: ret define void @test(<4 x i16>* %a, <4 x i16>* %b) { body: @@ -33,13 +33,14 @@ body: ; of the condition. ; ; CHECK-LABEL: test2: -; CHECK: vpslld $31, %xmm0, %xmm0 -; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1 -; CHECK-NEXT: vpshufd $78, %xmm0, %xmm0 ## xmm0 = xmm0[2,3,0,1] -; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, [[MASK:%ymm[0-9]+]] -; CHECK: vblendvpd [[MASK]] -; CHECK: retq +; CHECK: vpslld $31, %xmm0, %xmm0 +; CHECK-NEXT: vpsrad $31, %xmm0, %xmm0 +; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1 +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, [[MASK:%ymm[0-9]+]] +; CHECK: vblendvpd [[MASK]] +; CHECK: retq define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) { bb: %arrayidx1928 = getelementptr inbounds double*, double** %call1559, i64 %indvars.iv4198