Add support for matching shuffle patterns with palignr.

llvm-svn: 84459
2024-12-30 15:45:26 +00:00 · 2009-10-19 02:17:23 +00:00 · 2009-10-19 02:17:23 +00:00 · 1308a36647
commit 1308a36647
parent 71505cfb64
6 changed files with 186 additions and 27 deletions
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -2389,6 +2389,56 @@ bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
  return ::isPSHUFLWMask(M, N->getValueType(0));
 }

+/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PALIGNR.
+static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
+                          bool hasSSSE3) {
+  int i, e = VT.getVectorNumElements();
+  
+  // Do not handle v2i64 / v2f64 shuffles with palignr.
+  if (e < 4 || !hasSSSE3)
+    return false;
+  
+  for (i = 0; i != e; ++i)
+    if (Mask[i] >= 0)
+      break;
+  
+  // All undef, not a palignr.
+  if (i == e)
+    return false;
+
+  // Determine if it's ok to perform a palignr with only the LHS, since we
+  // don't have access to the actual shuffle elements to see if RHS is undef.
+  bool Unary = Mask[i] < (int)e;
+  bool NeedsUnary = false;
+
+  int s = Mask[i] - i;
+  
+  // Check the rest of the elements to see if they are consecutive.
+  for (++i; i != e; ++i) {
+    int m = Mask[i];
+    if (m < 0) 
+      continue;
+    
+    Unary = Unary && (m < (int)e);
+    NeedsUnary = NeedsUnary || (m < s);
+
+    if (NeedsUnary && !Unary)
+      return false;
+    if (Unary && m != ((s+i) & (e-1)))
+      return false;
+    if (!Unary && m != (s+i))
+      return false;
+  }
+  return true;
+}
+
+bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) {
+  SmallVector<int, 8> M;
+  N->getMask(M);
+  return ::isPALIGNRMask(M, N->getValueType(0), true);
+}
+
 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to SHUFP*.
 static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
@ -2733,8 +2783,7 @@ bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
 }

 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
-/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP*
-/// instructions.
+/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
 unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  int NumOperands = SVOp->getValueType(0).getVectorNumElements();
@ -2753,8 +2802,7 @@ unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
 }

 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
-/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW
-/// instructions.
+/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
 unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  unsigned Mask = 0;
@ -2770,8 +2818,7 @@ unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
 }

 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
-/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW
-/// instructions.
+/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
 unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  unsigned Mask = 0;
@ -2786,6 +2833,23 @@ unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
  return Mask;
 }

+/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
+unsigned X86::getShufflePALIGNRImmediate(SDNode *N) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  EVT VVT = N->getValueType(0);
+  unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3;
+  int Val = 0;
+
+  unsigned i, e;
+  for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) {
+    Val = SVOp->getMaskElt(i);
+    if (Val >= 0)
+      break;
+  }
+  return (Val - i) * EltSize;
+}
+
 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
 /// constant +0.0.
 bool X86::isZeroNode(SDValue Elt) {
@ -7274,7 +7338,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
  if (VT.getSizeInBits() == 64)
    return false;

-  // FIXME: pshufb, blends, palignr, shifts.
+  // FIXME: pshufb, blends, shifts.
  return (VT.getVectorNumElements() == 2 ||
          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
          isMOVLMask(M, VT) ||
@ -7282,6 +7346,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
          isPSHUFDMask(M, VT) ||
          isPSHUFHWMask(M, VT) ||
          isPSHUFLWMask(M, VT) ||
+          isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
          isUNPCKLMask(M, VT) ||
          isUNPCKHMask(M, VT) ||
          isUNPCKL_v_undef_Mask(M, VT) ||
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@ -323,21 +323,27 @@ namespace llvm {
    /// specifies a shuffle of elements that is suitable for input to MOVDDUP.
    bool isMOVDDUPMask(ShuffleVectorSDNode *N);

+    /// isPALIGNRMask - Return true if the specified VECTOR_SHUFFLE operand
+    /// specifies a shuffle of elements that is suitable for input to PALIGNR.
+    bool isPALIGNRMask(ShuffleVectorSDNode *N);
+
    /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
    /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP*
    /// instructions.
    unsigned getShuffleSHUFImmediate(SDNode *N);

    /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
-    /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW
-    /// instructions.
+    /// the specified VECTOR_SHUFFLE mask with PSHUFHW instruction.
    unsigned getShufflePSHUFHWImmediate(SDNode *N);

-    /// getShufflePSHUFKWImmediate - Return the appropriate immediate to shuffle
-    /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW
-    /// instructions.
+    /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
+    /// the specified VECTOR_SHUFFLE mask with PSHUFLW instruction.
    unsigned getShufflePSHUFLWImmediate(SDNode *N);

+    /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
+    /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
+    unsigned getShufflePALIGNRImmediate(SDNode *N);
+
    /// isZeroNode - Returns true if Elt is a constant zero or a floating point
    /// constant +0.0.
    bool isZeroNode(SDValue Elt);
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@ -197,6 +197,12 @@ def SHUFFLE_get_pshuflw_imm : SDNodeXForm<vector_shuffle, [{
  return getI8Imm(X86::getShufflePSHUFLWImmediate(N));
 }]>;

+// SHUFFLE_get_palign_imm xform function: convert vector_shuffle mask to
+// a PALIGNR imm.
+def SHUFFLE_get_palign_imm : SDNodeXForm<vector_shuffle, [{
+  return getI8Imm(X86::getShufflePALIGNRImmediate(N));
+}]>;
+
 def splat_lo : PatFrag<(ops node:$lhs, node:$rhs),
                       (vector_shuffle node:$lhs, node:$rhs), [{
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
@ -283,6 +289,11 @@ def pshuflw : PatFrag<(ops node:$lhs, node:$rhs),
  return X86::isPSHUFLWMask(cast<ShuffleVectorSDNode>(N));
 }], SHUFFLE_get_pshuflw_imm>;

+def palign : PatFrag<(ops node:$lhs, node:$rhs),
+                     (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isPALIGNRMask(cast<ShuffleVectorSDNode>(N));
+}], SHUFFLE_get_palign_imm>;
+
 //===----------------------------------------------------------------------===//
 // SSE scalar FP Instructions
 //===----------------------------------------------------------------------===//
@ -2062,6 +2073,7 @@ defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128>;
 defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128>;

 // Shuffle and unpack instructions
+let AddedComplexity = 5 in {
 def PSHUFDri : PDIi8<0x70, MRMSrcReg,
                     (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
                     "pshufd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@ -2073,6 +2085,7 @@ def PSHUFDmi : PDIi8<0x70, MRMSrcMem,
                     [(set VR128:$dst, (v4i32 (pshufd:$src2
                                             (bc_v4i32(memopv2i64 addr:$src1)),
                                             (undef))))]>;
+}                                             

 // SSE2 with ImmT == Imm8 and XS prefix.
 def PSHUFHWri : Ii8<0x70, MRMSrcReg,
@ -2839,6 +2852,26 @@ let Constraints = "$src1 = $dst" in {
                              imm:$src3))]>, OpSize;
 }

+// palignr patterns.
+let AddedComplexity = 5 in {
+def : Pat<(v4i32 (palign:$src3 VR128:$src1, VR128:$src2)),
+          (PALIGNR128rr VR128:$src2, VR128:$src1,
+                        (SHUFFLE_get_palign_imm VR128:$src3))>,
+      Requires<[HasSSSE3]>;
+def : Pat<(v4f32 (palign:$src3 VR128:$src1, VR128:$src2)),
+          (PALIGNR128rr VR128:$src2, VR128:$src1,
+                        (SHUFFLE_get_palign_imm VR128:$src3))>,
+      Requires<[HasSSSE3]>;
+def : Pat<(v8i16 (palign:$src3 VR128:$src1, VR128:$src2)),
+          (PALIGNR128rr VR128:$src2, VR128:$src1,
+                        (SHUFFLE_get_palign_imm VR128:$src3))>,
+      Requires<[HasSSSE3]>;
+def : Pat<(v16i8 (palign:$src3 VR128:$src1, VR128:$src2)),
+          (PALIGNR128rr VR128:$src2, VR128:$src1,
+                        (SHUFFLE_get_palign_imm VR128:$src3))>,
+      Requires<[HasSSSE3]>;
+}      
+
 def : Pat<(X86pshufb VR128:$src, VR128:$mask),
          (PSHUFBrr128 VR128:$src, VR128:$mask)>, Requires<[HasSSSE3]>;
 def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))),
--- a/test/CodeGen/X86/palignr.ll
+++ b/test/CodeGen/X86/palignr.ll
@ -0,0 +1,58 @@
+; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck --check-prefix=YONAH %s
+
+define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind {
+; CHECK: pshufd
+; CHECK-YONAH: pshufd
+  %C = shufflevector <4 x i32> %A, <4 x i32> undef, <4 x i32> < i32 1, i32 2, i32 3, i32 0 >
+	ret <4 x i32> %C
+}
+
+define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) nounwind {
+; CHECK: palignr
+; CHECK-YONAH: shufps
+  %C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 1, i32 2, i32 3, i32 4 >
+	ret <4 x i32> %C
+}
+
+define <4 x i32> @test3(<4 x i32> %A, <4 x i32> %B) nounwind {
+; CHECK: palignr
+  %C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 1, i32 2, i32 undef, i32 4 >
+	ret <4 x i32> %C
+}
+
+define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) nounwind {
+; CHECK: palignr
+  %C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 6, i32 7, i32 undef, i32 1 >
+	ret <4 x i32> %C
+}
+
+define <4 x float> @test5(<4 x float> %A, <4 x float> %B) nounwind {
+; CHECK: palignr
+  %C = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 6, i32 7, i32 undef, i32 1 >
+	ret <4 x float> %C
+}
+
+define <8 x i16> @test6(<8 x i16> %A, <8 x i16> %B) nounwind {
+; CHECK: palignr
+  %C = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 3, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10 >
+	ret <8 x i16> %C
+}
+
+define <8 x i16> @test7(<8 x i16> %A, <8 x i16> %B) nounwind {
+; CHECK: palignr
+  %C = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 undef, i32 6, i32 undef, i32 8, i32 9, i32 10, i32 11, i32 12 >
+	ret <8 x i16> %C
+}
+
+define <8 x i16> @test8(<8 x i16> %A, <8 x i16> %B) nounwind {
+; CHECK: palignr
+  %C = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 undef, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0 >
+	ret <8 x i16> %C
+}
+
+define <16 x i8> @test9(<16 x i8> %A, <16 x i8> %B) nounwind {
+; CHECK: palignr
+  %C = shufflevector <16 x i8> %A, <16 x i8> %B, <16 x i32> < i32 5, i32 6, i32 7, i32 undef, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20 >
+	ret <16 x i8> %C
+}
--- a/test/CodeGen/X86/vec_shuffle-22.ll
+++ b/test/CodeGen/X86/vec_shuffle-22.ll
@ -1,19 +1,15 @@
-; RUN: llc < %s -march=x86 -mcpu=pentium-m -o %t
-; RUN: grep movlhps %t | count 1
-; RUN: grep pshufd %t | count 1
-; RUN: llc < %s -march=x86 -mcpu=core2 -o %t
-; RUN: grep movlhps %t | count 1
-; RUN: grep movddup %t | count 1
+; RUN: llc < %s -march=x86 -mcpu=pentium-m  | FileCheck %s

 define <4 x float> @t1(<4 x float> %a) nounwind  {
-entry:
-        %tmp1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> < i32 0, i32 1, i32 0, i32 1 >       ; <<4 x float>> [#uses=1]
-        ret <4 x float> %tmp1
+; CHECK: movlhps
+  %tmp1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> < i32 0, i32 1, i32 0, i32 1 >       ; <<4 x float>> [#uses=1]
+  ret <4 x float> %tmp1
 }

 define <4 x i32> @t2(<4 x i32>* %a) nounwind {
-entry:
-        %tmp1 = load <4 x i32>* %a;
+; CHECK: pshufd
+; CHECK: ret
+  %tmp1 = load <4 x i32>* %a;
 	%tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> < i32 0, i32 1, i32 0, i32 1 >		; <<4 x i32>> [#uses=1]
 	ret <4 x i32> %tmp2
 }
--- a/test/CodeGen/X86/vec_shuffle-9.ll
+++ b/test/CodeGen/X86/vec_shuffle-9.ll
@ -1,9 +1,10 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -o %t
-; RUN: grep punpck %t | count 2
-; RUN: not grep pextrw %t
+; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s

 define <4 x i32> @test(i8** %ptr) {
-entry:
+; CHECK: xorps
+; CHECK: punpcklbw
+; CHECK: punpcklwd
+
 	%tmp = load i8** %ptr		; <i8*> [#uses=1]
 	%tmp.upgrd.1 = bitcast i8* %tmp to float*		; <float*> [#uses=1]
 	%tmp.upgrd.2 = load float* %tmp.upgrd.1		; <float> [#uses=1]