AVX-512: Implemented vector shuffle lowering for v8i64 and v8f64 types.

I removed the vector-shuffle-512-v8.ll, it is auto-generated test, not valid any more. llvm-svn: 238735
2025-02-04 03:06:28 +00:00 · 2015-06-01 09:49:53 +00:00 · 2015-06-01 09:49:53 +00:00 · 0afba4941b
commit 0afba4941b
parent 5ab2bd121b
3 changed files with 200 additions and 1485 deletions
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -6260,6 +6260,42 @@ is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
  return true;
 }

+/// \brief Test whether a shuffle mask is equivalent within each 256-bit lane.
+///
+/// This checks a shuffle mask to see if it is performing the same
+/// 256-bit lane-relative shuffle in each 256-bit lane. This trivially implies
+/// that it is also not lane-crossing. It may however involve a blend from the
+/// same lane of a second vector.
+///
+/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
+/// non-trivial to compute in the face of undef lanes. The representation is
+/// *not* suitable for use with existing 256-bit shuffles as it will contain
+/// entries from both V1 and V2 inputs to the wider mask.
+static bool
+is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
+                                SmallVectorImpl<int> &RepeatedMask) {
+  int LaneSize = 256 / VT.getScalarSizeInBits();
+  RepeatedMask.resize(LaneSize, -1);
+  int Size = Mask.size();
+  for (int i = 0; i < Size; ++i) {
+    if (Mask[i] < 0)
+      continue;
+    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
+      // This entry crosses lanes, so there is no way to model this shuffle.
+      return false;
+
+    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
+    if (RepeatedMask[i % LaneSize] == -1)
+      // This is the first non-undef entry in this slot of a 256-bit lane.
+      RepeatedMask[i % LaneSize] =
+          Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
+    else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
+      // Found a mismatch with the repeated mask.
+      return false;
+  }
+  return true;
+}
+
 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
 /// arguments.
 ///
@ -10054,12 +10090,15 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 }

 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
-static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+static SDValue lowerV8X64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
-  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
-  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
+  MVT VT = Op.getSimpleValueType();
+  assert((V1.getSimpleValueType() == MVT::v8f64 ||
+          V1.getSimpleValueType() == MVT::v8i64) && "Bad operand type!");
+  assert((V2.getSimpleValueType() == MVT::v8f64 ||
+          V2.getSimpleValueType() == MVT::v8i64) && "Bad operand type!");
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
@ -10067,12 +10106,65 @@ static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
  // X86 has dedicated unpack instructions that can handle specific blend
  // operations: UNPCKH and UNPCKL.
  if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
+    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
+    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);

-  // FIXME: Implement direct support for this type!
-  return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
+  // VSHUFPD instruction - mask 0/1, 8/9, 2/3, 10/11, 4/5, 12/13, 6/7, 14/15
+  bool ShufpdMask = true;
+  unsigned Immediate = 0;
+  for (int i = 0; i < 8; ++i) {
+    if (Mask[i] < 0)
+      continue;
+    int Val = (i & 6) + 8 * (i & 1);
+    if (Mask[i] < Val ||  Mask[i] > Val+1) {
+      ShufpdMask = false;
+      break;
+    }
+    Immediate |= (Mask[i]%2) << i;
+  }
+  if (ShufpdMask)
+      return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
+                         DAG.getConstant(Immediate, DL, MVT::i8));
+
+  // PERMILPD instruction - mask 0/1, 0/1, 2/3, 2/3, 4/5, 4/5, 6/7, 6/7
+  if (isSingleInputShuffleMask(Mask)) {
+    bool PermilMask = true;
+    unsigned Immediate = 0;
+    for (int i = 0; i < 8; ++i) {
+      if (Mask[i] < 0)
+        continue;
+      int Val = (i & 6);
+      if (Mask[i] < Val ||  Mask[i] > Val+1) {
+        PermilMask = false;
+        break;
+      }
+      Immediate |= (Mask[i]%2) << i;
+    }
+    if (PermilMask)
+      return DAG.getNode(X86ISD::VPERMILPI, DL, VT, V1,
+                         DAG.getConstant(Immediate, DL, MVT::i8));
+
+    SmallVector<int, 4> RepeatedMask;
+    if (is256BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) {
+      unsigned Immediate = 0;
+      for (int i = 0; i < 4; ++i)
+        if (RepeatedMask[i] > 0)
+          Immediate |= (RepeatedMask[i] & 3) << (i*2);
+      return DAG.getNode(X86ISD::VPERMI, DL, VT, V1,
+                         DAG.getConstant(Immediate, DL, MVT::i8));
+    }
+  }
+  SDValue VPermMask[8];
+  for (int i = 0; i < 8; ++i)
+    VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i64)
+                               : DAG.getConstant(Mask[i], DL, MVT::i64);
+  SDValue MaskNode = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i64,
+                                 VPermMask);
+  if (isSingleInputShuffleMask(Mask))
+    return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
+
+  return DAG.getNode(X86ISD::VPERMV3, DL, VT, MaskNode, V1, V2);
 }

 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
@ -10104,28 +10196,6 @@ static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
  return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
 }

-/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
-static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
-                                       const X86Subtarget *Subtarget,
-                                       SelectionDAG &DAG) {
-  SDLoc DL(Op);
-  assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
-  assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  ArrayRef<int> Mask = SVOp->getMask();
-  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
-
-  // X86 has dedicated unpack instructions that can handle specific blend
-  // operations: UNPCKH and UNPCKL.
-  if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
-  if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
-
-  // FIXME: Implement direct support for this type!
-  return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
-}
-
 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
 static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                       const X86Subtarget *Subtarget,
@ -10212,11 +10282,10 @@ static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
  // the requisite ISA extensions for that element type are available.
  switch (VT.SimpleTy) {
  case MVT::v8f64:
-    return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v8i64:
+    return lowerV8X64VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v16f32:
    return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
-  case MVT::v8i64:
-    return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v16i32:
    return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
  case MVT::v32i16:
@ -20482,7 +20551,7 @@ static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
 #ifndef NDEBUG
    for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
      for (int j = 0; j < LaneElts; ++j)
-        assert(Mask[j] == Mask[i * LaneElts + j] - LaneElts &&
+        assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
               "Mask doesn't repeat in high 128-bit lanes!");
 #endif
    Mask.resize(LaneElts);
--- a/test/CodeGen/X86/avx512-shuffle.ll
+++ b/test/CodeGen/X86/avx512-shuffle.ll
@ -0,0 +1,98 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK-LABEL: test1:
+; CHECK: vpermps
+; CHECK: ret
+define <16 x float> @test1(<16 x float> %a) nounwind {
+  %c = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1,  i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
+  ret <16 x float> %c
+}
+
+; CHECK-LABEL: test2:
+; CHECK: vpermd
+; CHECK: ret
+define <16 x i32> @test2(<16 x i32> %a) nounwind {
+  %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1,  i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
+  ret <16 x i32> %c
+}
+
+; CHECK-LABEL: test3:
+; CHECK: vpermq
+; CHECK: ret
+define <8 x i64> @test3(<8 x i64> %a) nounwind {
+  %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 5, i32 1, i32 undef, i32 7, i32 undef, i32 3, i32 1>
+  ret <8 x i64> %c
+}
+
+; CHECK-LABEL: test4:
+; CHECK: vpermpd
+; CHECK: ret
+define <8 x double> @test4(<8 x double> %a) nounwind {
+  %c = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x double> %c
+}
+
+; CHECK-LABEL: test5:
+; CHECK: vpermt2pd
+; CHECK: ret
+define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind {
+  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
+  ret <8 x double> %c
+}
+
+; CHECK-LABEL: test6:
+; CHECK: vpermq $30
+; CHECK: ret
+define <8 x i64> @test6(<8 x i64> %a) nounwind {
+  %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
+  ret <8 x i64> %c
+}
+
+; CHECK-LABEL: test7:
+; CHECK: vpermt2q
+; CHECK: ret
+define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind {
+  %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
+  ret <8 x i64> %c
+}
+
+; CHECK-LABEL: test14
+; CHECK: vpermilpd $203, %zmm
+; CHECK: ret
+define <8 x double> @test14(<8 x double> %a) {
+ %b = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32><i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 7, i32 7>
+ ret <8 x double> %b
+}
+
+; CHECK-LABEL: test17
+; CHECK: vshufpd $19, %zmm1, %zmm0
+; CHECK: ret
+define <8 x double> @test17(<8 x double> %a, <8 x double> %b) nounwind {
+  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 9, i32 2, i32 10, i32 5, i32 undef, i32 undef, i32 undef>
+  ret <8 x double> %c
+}
+
+; CHECK-LABEL: test20
+; CHECK: vpunpckhqdq  %zmm
+; CHECK: ret
+define <8 x i64> @test20(<8 x i64> %a, <8 x i64> %c) {
+ %b = shufflevector <8 x i64> %a, <8 x i64> %c, <8 x i32><i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x i64> %b
+}
+
+; CHECK-LABEL: test21
+; CHECK: vbroadcastsd  %xmm0, %zmm
+; CHECK: ret
+define <8 x double> @test21(<8 x double> %a, <8 x double> %b) {
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x double> %shuffle
+}
+
+; CHECK-LABEL: test22
+; CHECK: vpbroadcastq  %xmm0, %zmm
+; CHECK: ret
+define <8 x i64> @test22(<8 x i64> %a, <8 x i64> %b) {
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i64> %shuffle
+}
+
--- a/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll