From 04876e5fe54e81ce951196b52e3c6aebea54ede6 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 9 Aug 2016 12:56:15 +0000 Subject: [PATCH] [X86][XOP] Add support for combining target shuffles to VPERMIL2PD/VPERMIL2PS git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@278120 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 44 +++++++++++++++++++ .../X86/vector-shuffle-combining-xop.ll | 30 +++++++++++++ 2 files changed, 74 insertions(+) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 77fa9ff26f2..3f25cb15441 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -3844,6 +3844,7 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) { default: return false; case X86ISD::PSHUFB: case X86ISD::VPERMILPV: + case X86ISD::VPERMIL2: case X86ISD::VPPERM: return true; } @@ -25288,6 +25289,49 @@ static bool combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, return true; } + // With XOP, binary shuffles of 128/256-bit floating point vectors can combine + // to VPERMIL2PD/VPERMIL2PS. + if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() && + (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 || + MaskVT == MVT::v8f32)) { + // VPERMIL2 Operation. + // Bits[3] - Match Bit. + // Bits[2:1] - (Per Lane) PD Shuffle Mask. + // Bits[2:0] - (Per Lane) PS Shuffle Mask. 
+ unsigned NumLanes = MaskVT.getSizeInBits() / 128; + unsigned NumEltsPerLane = NumMaskElts / NumLanes; + SmallVector<SDValue, 8> VPerm2Idx; + MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits()); + MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts); + unsigned M2ZImm = 0; + for (int M : Mask) { + if (M == SM_SentinelUndef) { + VPerm2Idx.push_back(DAG.getUNDEF(MaskIdxSVT)); + continue; + } + if (M == SM_SentinelZero) { + M2ZImm = 2; + VPerm2Idx.push_back(DAG.getConstant(8, DL, MaskIdxSVT)); + continue; + } + int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane); + Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index); + VPerm2Idx.push_back(DAG.getConstant(Index, DL, MaskIdxSVT)); + } + V1 = DAG.getBitcast(MaskVT, V1); + DCI.AddToWorklist(V1.getNode()); + V2 = DAG.getBitcast(MaskVT, V2); + DCI.AddToWorklist(V2.getNode()); + SDValue VPerm2MaskOp = DAG.getBuildVector(MaskIdxVT, DL, VPerm2Idx); + DCI.AddToWorklist(VPerm2MaskOp.getNode()); + Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp, + DAG.getConstant(M2ZImm, DL, MVT::i8)); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; + } + // If we have 3 or more shuffle instructions or a chain involving a variable // mask, we can replace them with a single PSHUFB instruction profitably. 
// Intel's manuals suggest only using PSHUFB if doing so replacing 5 diff --git a/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/test/CodeGen/X86/vector-shuffle-combining-xop.ll index aefe3d03a19..9ef9da3bec4 100644 --- a/test/CodeGen/X86/vector-shuffle-combining-xop.ll +++ b/test/CodeGen/X86/vector-shuffle-combining-xop.ll @@ -30,6 +30,16 @@ define <4 x double> @combine_vpermil2pd256_identity(<4 x double> %a0, <4 x doubl ret <4 x double> %res1 } +define <4 x double> @combine_vpermil2pd256_0z73(<4 x double> %a0, <4 x double> %a1) { +; CHECK-LABEL: combine_vpermil2pd256_0z73: +; CHECK: # BB#0: +; CHECK-NEXT: vpermil2pd {{.*#+}} ymm0 = ymm0[0],zero,ymm1[3],ymm0[3] +; CHECK-NEXT: retq + %res0 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> + %res1 = shufflevector <4 x double> %res0, <4 x double> zeroinitializer, <4 x i32> + ret <4 x double> %res1 +} + define <4 x float> @combine_vpermil2ps_identity(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: combine_vpermil2ps_identity: ; CHECK: # BB#0: @@ -40,6 +50,16 @@ define <4 x float> @combine_vpermil2ps_identity(<4 x float> %a0, <4 x float> %a1 ret <4 x float> %res1 } +define <4 x float> @combine_vpermil2ps_1z74(<4 x float> %a0, <4 x float> %a1) { +; CHECK-LABEL: combine_vpermil2ps_1z74: +; CHECK: # BB#0: +; CHECK-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[1],zero,xmm1[3,0] +; CHECK-NEXT: retq + %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> , i8 0) + %res1 = shufflevector <4 x float> %res0, <4 x float> zeroinitializer, <4 x i32> + ret <4 x float> %res1 +} + define <8 x float> @combine_vpermil2ps256_identity(<8 x float> %a0, <8 x float> %a1) { ; CHECK-LABEL: combine_vpermil2ps256_identity: ; CHECK: # BB#0: @@ -50,6 +70,16 @@ define <8 x float> @combine_vpermil2ps256_identity(<8 x float> %a0, <8 x float> ret <8 x float> %res1 } +define <8 x float> @combine_vpermil2ps256_08z945Az(<8 x float> %a0, <8 x float> %a1) { +; CHECK-LABEL: combine_vpermil2ps256_08z945Az: 
+; CHECK: # BB#0: +; CHECK-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[0],ymm1[0],zero,ymm1[1],ymm0[4,5],ymm1[6],zero +; CHECK-NEXT: retq + %res0 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> , i8 0) + %res1 = shufflevector <8 x float> %res0, <8 x float> zeroinitializer, <8 x i32> + ret <8 x float> %res1 +} + define <8 x float> @combine_vpermil2ps256_zero(<8 x float> %a0, <8 x float> %a1) { ; CHECK-LABEL: combine_vpermil2ps256_zero: ; CHECK: # BB#0: