The vpermilps and vpermilpd instructions have different behaviour regarding the
usage of the shuffle bitmask. Both work in 128-bit lanes without crossing, but
in the former the mask of the high part is the same one used by the low part,
while in the latter both lanes have independent masks. Handle this properly and
add support for vpermilpd.

llvm-svn: 136200
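As an editor's illustration of the rule described above (not part of the patch), here is a minimal standalone C++ sketch that checks the two mask shapes on 256-bit vectors, using the two shuffle masks from the new test file: a vpermilps-style mask must repeat the low-lane pattern in the high lane, while a vpermilpd-style mask only requires that no element crosses its 128-bit lane.

#include <cassert>

int main() {
  // v8f32 shuffle mask <1,2,3,1, 5,6,7,5>: each high-lane index is the
  // corresponding low-lane index plus 4, so vpermilps can encode it.
  int PS[8] = {1, 2, 3, 1, 5, 6, 7, 5};
  for (int i = 0; i != 4; ++i)
    assert(PS[i + 4] - PS[i] == 4 && "high lane must mirror the low lane");

  // v4f64 shuffle mask <1,0,3,3>: the two lanes use different patterns,
  // but no index leaves its own lane, so vpermilpd can encode it.
  int PD[4] = {1, 0, 3, 3};
  for (int l = 0; l != 2; ++l)
    for (int i = 2 * l; i != 2 * (l + 1); ++i)
      assert(PD[i] >= 2 * l && PD[i] < 2 * (l + 1) && "lanes can't be crossed");
  return 0;
}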
Bruno Cardoso Lopes 2011-07-27 00:56:34 +00:00
parent 1adb959ee8
commit 8830fde434
5 changed files with 156 additions and 32 deletions

View File

@@ -2717,7 +2717,10 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::PUNPCKHBW:
case X86ISD::PUNPCKHDQ:
case X86ISD::PUNPCKHQDQ:
case X86ISD::VPERMIL:
case X86ISD::VPERMILPS:
case X86ISD::VPERMILPSY:
case X86ISD::VPERMILPD:
case X86ISD::VPERMILPDY:
return true;
}
return false;
@@ -2743,7 +2746,10 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
case X86ISD::VPERMIL:
case X86ISD::VPERMILPS:
case X86ISD::VPERMILPSY:
case X86ISD::VPERMILPD:
case X86ISD::VPERMILPDY:
return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
}
@@ -3400,21 +3406,63 @@ bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
return ::isMOVLMask(M, N->getValueType(0));
}
/// isVPERMILMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to VPERMIL*.
static bool isVPERMILMask(const SmallVectorImpl<int> &Mask, EVT VT) {
/// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
/// Note that VPERMIL mask matching differs depending on whether the underlying
/// type is 32 or 64 bits wide. In VPERMILPS the high half of the mask should
/// select the same elements as the low half, but from the higher half of the
/// source. In VPERMILPD the two lanes can be shuffled independently of each
/// other, with the same restriction that lanes can't be crossed.
static bool isVPERMILPDMask(const SmallVectorImpl<int> &Mask, EVT VT,
const X86Subtarget *Subtarget) {
int NumElts = VT.getVectorNumElements();
int NumLanes = VT.getSizeInBits()/128;
if (!Subtarget->hasAVX())
return false;
// Match any permutation of 128-bit vector with 64-bit types
if (NumLanes == 1 && NumElts != 2)
return false;
// Only match 256-bit with 64-bit types
if (VT.getSizeInBits() == 256 && NumElts != 4)
return false;
// The mask on the high lane is independent of the low. Both can match
// any element inside its own lane, but can't cross.
int LaneSize = NumElts/NumLanes;
for (int l = 0; l < NumLanes; ++l)
for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) {
int LaneStart = l*LaneSize;
if (!isUndefOrInRange(Mask[i], LaneStart, LaneStart+LaneSize))
return false;
}
return true;
}
/// isVPERMILPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to VPERMILPS*.
/// Note that VPERMIL mask matching differs depending on whether the underlying
/// type is 32 or 64 bits wide. In VPERMILPS the high half of the mask should
/// select the same elements as the low half, but from the higher half of the
/// source. In VPERMILPD the two lanes can be shuffled independently of each
/// other, with the same restriction that lanes can't be crossed.
static bool isVPERMILPSMask(const SmallVectorImpl<int> &Mask, EVT VT,
const X86Subtarget *Subtarget) {
unsigned NumElts = VT.getVectorNumElements();
unsigned NumLanes = VT.getSizeInBits()/128;
// Match any permutation of 128-bit vector with 32/64-bit types
if (NumLanes == 1) {
if (NumElts == 4 || NumElts == 2)
return true;
if (!Subtarget->hasAVX())
return false;
}
// Only match 256-bit with 32/64-bit types
if (NumElts != 8 && NumElts != 4)
// Match any permutation of 128-bit vector with 32-bit types
if (NumLanes == 1 && NumElts != 4)
return false;
// Only match 256-bit with 32-bit types
if (VT.getSizeInBits() == 256 && NumElts != 8)
return false;
// The mask on the high lane should be the same as the low. Actually,
@@ -3424,7 +3472,6 @@ static bool isVPERMILMask(const SmallVectorImpl<int> &Mask, EVT VT) {
int HighElt = i+LaneSize;
if (Mask[i] < 0 || Mask[HighElt] < 0)
continue;
if (Mask[HighElt]-Mask[i] != LaneSize)
return false;
}
@@ -3432,9 +3479,9 @@ static bool isVPERMILMask(const SmallVectorImpl<int> &Mask, EVT VT) {
return true;
}
/// getShuffleVPERMILImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with VPERMIL* instructions.
static unsigned getShuffleVPERMILImmediate(SDNode *N) {
/// getShuffleVPERMILPSImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with VPERMILPS* instructions.
static unsigned getShuffleVPERMILPSImmediate(SDNode *N) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
EVT VT = SVOp->getValueType(0);
@@ -3448,6 +3495,24 @@ static unsigned getShuffleVPERMILImmediate(SDNode *N) {
return Mask;
}
/// getShuffleVPERMILPDImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with VPERMILPD* instructions.
static unsigned getShuffleVPERMILPDImmediate(SDNode *N) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
EVT VT = SVOp->getValueType(0);
int NumElts = VT.getVectorNumElements();
int NumLanes = VT.getSizeInBits()/128;
unsigned Mask = 0;
int LaneSize = NumElts/NumLanes;
for (int l = 0; l < NumLanes; ++l)
for (int i = l*LaneSize; i < LaneSize*(l+1); ++i)
Mask |= (SVOp->getMaskElt(i)-l*LaneSize) << i;
return Mask;
}
/// isCommutedMOVL - Returns true if the shuffle mask is the reverse of what
/// x86 movss wants. X86 movss requires the lowest element to be the lowest
/// element of vector 2 and the other elements to come from vector 1 in order.
@@ -4163,7 +4228,9 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
Depth+1);
}
case X86ISD::VPERMIL:
case X86ISD::VPERMILPS:
case X86ISD::VPERMILPSY:
// FIXME: Implement the other types
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeVPERMILMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
ShuffleMask);
@@ -5784,6 +5851,22 @@ static inline unsigned getUNPCKHOpcode(EVT VT) {
return 0;
}
static inline unsigned getVPERMILOpcode(EVT VT) {
switch(VT.getSimpleVT().SimpleTy) {
case MVT::v4i32:
case MVT::v4f32: return X86ISD::VPERMILPS;
case MVT::v2i64:
case MVT::v2f64: return X86ISD::VPERMILPD;
case MVT::v8i32:
case MVT::v8f32: return X86ISD::VPERMILPSY;
case MVT::v4i64:
case MVT::v4f64: return X86ISD::VPERMILPDY;
default:
llvm_unreachable("Unknown type for vpermil");
}
return 0;
}
static
SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
const TargetLowering &TLI,
@@ -6123,14 +6206,25 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
//===--------------------------------------------------------------------===//
// Custom lower or generate target specific nodes for 256-bit shuffles.
// Generate target specific nodes for 128 or 256-bit shuffles only
// supported in the AVX instruction set.
//
// Handle VPERMIL permutations
if (isVPERMILMask(M, VT)) {
unsigned TargetMask = getShuffleVPERMILImmediate(SVOp);
if (VT == MVT::v8f32)
return getTargetShuffleNode(X86ISD::VPERMIL, dl, VT, V1, TargetMask, DAG);
}
// Handle VPERMILPS* permutations
if (isVPERMILPSMask(M, VT, Subtarget))
return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
getShuffleVPERMILPSImmediate(SVOp), DAG);
// Handle VPERMILPD* permutations
if (isVPERMILPDMask(M, VT, Subtarget))
return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
getShuffleVPERMILPDImmediate(SVOp), DAG);
//===--------------------------------------------------------------------===//
// Since no target specific shuffle was selected for this generic one,
// lower it into other known shuffles. FIXME: this isn't true yet, but
// this is the plan.
//
// Handle general 256-bit shuffles
if (VT.is256BitVector())
@@ -9748,7 +9842,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD";
case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ";
case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ";
case X86ISD::VPERMIL: return "X86ISD::VPERMIL";
case X86ISD::VPERMILPS: return "X86ISD::VPERMILPS";
case X86ISD::VPERMILPSY: return "X86ISD::VPERMILPSY";
case X86ISD::VPERMILPD: return "X86ISD::VPERMILPD";
case X86ISD::VPERMILPDY: return "X86ISD::VPERMILPDY";
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
@@ -12666,7 +12763,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::PSHUFLW:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
case X86ISD::VPERMIL:
case X86ISD::VPERMILPS:
case X86ISD::VPERMILPSY:
case X86ISD::VPERMILPD:
case X86ISD::VPERMILPDY:
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI);
}

View File

@@ -271,7 +271,10 @@ namespace llvm {
PUNPCKHWD,
PUNPCKHDQ,
PUNPCKHQDQ,
VPERMIL,
VPERMILPS,
VPERMILPSY,
VPERMILPD,
VPERMILPDY,
// VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
// according to %al. An operator is needed so that this can be expanded

View File

@@ -153,7 +153,10 @@ def X86Punpckhwd : SDNode<"X86ISD::PUNPCKHWD", SDTShuff2Op>;
def X86Punpckhdq : SDNode<"X86ISD::PUNPCKHDQ", SDTShuff2Op>;
def X86Punpckhqdq : SDNode<"X86ISD::PUNPCKHQDQ", SDTShuff2Op>;
def X86VPermil : SDNode<"X86ISD::VPERMIL", SDTShuff2OpI>;
def X86VPermilps : SDNode<"X86ISD::VPERMILPS", SDTShuff2OpI>;
def X86VPermilpsy : SDNode<"X86ISD::VPERMILPSY", SDTShuff2OpI>;
def X86VPermilpd : SDNode<"X86ISD::VPERMILPD", SDTShuff2OpI>;
def X86VPermilpdy : SDNode<"X86ISD::VPERMILPDY", SDTShuff2OpI>;
//===----------------------------------------------------------------------===//
// SSE Complex Patterns

View File

@@ -5522,6 +5522,12 @@ def : Pat<(int_x86_avx_vperm2f128_si_256
VR256:$src1, (memopv8i32 addr:$src2), imm:$src3),
(VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>;
// Shuffle with VPERMIL instructions
def : Pat<(v8f32 (X86VPermilpsy VR256:$src1, (i8 imm:$imm))),
(VPERMILPSYri VR256:$src1, imm:$imm)>;
def : Pat<(v4f64 (X86VPermilpdy VR256:$src1, (i8 imm:$imm))),
(VPERMILPDYri VR256:$src1, imm:$imm)>;
//===----------------------------------------------------------------------===//
// VZERO - Zero YMM registers
//
@@ -5543,10 +5549,6 @@ def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
// The AVX versions of some but not all of them are described here, and more
// should come in the near future.
// Shuffle with VPERMIL instructions
def : Pat<(v8f32 (X86VPermil VR256:$src1, (i8 imm:$imm))),
(VPERMILPSYri VR256:$src1, imm:$imm)>;
// Shuffle with PSHUFD instruction folding loads. The first two patterns match
// SSE2 loads, which are always promoted to v2i64. The last one should match
// the SSE1 case, where the only legal load is v4f32, but there is no PSHUFD

View File

@@ -0,0 +1,16 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
; CHECK: vpermilps
define <8 x float> @funcA(<8 x float> %a) nounwind uwtable readnone ssp {
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 1, i32 5, i32 6, i32 7, i32 5>
ret <8 x float> %shuffle
}
; CHECK: vpermilpd
define <4 x double> @funcB(<4 x double> %a) nounwind uwtable readnone ssp {
entry:
%shuffle = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 3>
ret <4 x double> %shuffle
}
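For reference, here is a hedged editor's sketch (not part of the commit) of how the immediates for the two shuffles above would be packed: vpermilps takes two selector bits per element of the low lane, with the same 8-bit immediate reused for the high lane, while vpermilpd, following the getShuffleVPERMILPDImmediate logic added above, takes one selector bit per destination element relative to the start of its 128-bit lane.

#include <cstdio>

int main() {
  // funcA: v8f32 mask <1,2,3,1,5,6,7,5>; only the low-lane pattern is encoded,
  // two bits per destination element.
  int PS[4] = {1, 2, 3, 1};
  unsigned ImmPS = 0;
  for (int i = 0; i != 4; ++i)
    ImmPS |= PS[i] << (i * 2);            // yields 0x79

  // funcB: v4f64 mask <1,0,3,3>; one bit per destination element, with each
  // index taken relative to the start of its own 128-bit lane.
  int PD[4] = {1, 0, 3, 3};
  unsigned ImmPD = 0;
  for (int i = 0; i != 4; ++i)
    ImmPD |= (PD[i] - (i / 2) * 2) << i;  // yields 0xd
  printf("vpermilps imm = 0x%x, vpermilpd imm = 0x%x\n", ImmPS, ImmPD);
  return 0;
}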