From 8415f84e49664d6427af58f15741f404a309ebd2 Mon Sep 17 00:00:00 2001
From: Chandler Carruth
Date: Wed, 24 Sep 2014 01:03:57 +0000
Subject: [PATCH] [x86] Fix a really terrible bug in the repeated 128-bit lane
 shuffle detection.

It was incorrectly handling undef lanes by actually treating an undef
lane in the first 128-bit lane as a *numeric* shuffle value.

Fortunately, this almost always DTRT and disabled detecting repeated
patterns. But not always. =/ This patch introduces a much more principled
approach and fixes the miscompiles I spotted by inspection previously.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218346 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp        | 49 +++++++++++++++++------
 test/CodeGen/X86/vector-shuffle-256-v4.ll | 10 ++---
 2 files changed, 41 insertions(+), 18 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index f01f86f0c0a..279c21db8b8 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -9203,13 +9203,35 @@ static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
 ///
 /// This checks a shuffle mask to see if it is performing the same
 /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
-/// that it is also not lane-crossing.
-static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
+/// that it is also not lane-crossing. It may however involve a blend from the
+/// same lane of a second vector.
+///
+/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
+/// non-trivial to compute in the face of undef lanes. The representation is
+/// *not* suitable for use with existing 128-bit shuffles as it will contain
+/// entries from both V1 and V2 inputs to the wider mask.
+static bool
+is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
+                                SmallVectorImpl<int> &RepeatedMask) {
   int LaneSize = 128 / VT.getScalarSizeInBits();
+  RepeatedMask.resize(LaneSize, -1);
   int Size = Mask.size();
-  for (int i = LaneSize; i < Size; ++i)
-    if (Mask[i] >= 0 && Mask[i] != (Mask[i % LaneSize] + (i / LaneSize) * LaneSize))
+  for (int i = 0; i < Size; ++i) {
+    if (Mask[i] < 0)
+      continue;
+    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
+      // This entry crosses lanes, so there is no way to model this shuffle.
       return false;
+
+    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
+    if (RepeatedMask[i % LaneSize] == -1)
+      // This is the first non-undef entry in this slot of a 128-bit lane.
+      RepeatedMask[i % LaneSize] =
+          Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
+    else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
+      // Found a mismatch with the repeated mask.
+      return false;
+  }
   return true;
 }
 
@@ -9383,13 +9405,14 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
   // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
   // use lower latency instructions that will operate on both 128-bit lanes.
-  if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask)) {
+  SmallVector<int, 4> RepeatedMask;
+  if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
     if (isSingleInputShuffleMask(Mask)) {
       int PSHUFDMask[] = {-1, -1, -1, -1};
       for (int i = 0; i < 2; ++i)
-        if (Mask[i] >= 0) {
-          PSHUFDMask[2 * i] = 2 * Mask[i];
-          PSHUFDMask[2 * i + 1] = 2 * Mask[i] + 1;
+        if (RepeatedMask[i] >= 0) {
+          PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
+          PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
         }
       return DAG.getNode(
           ISD::BITCAST, DL, MVT::v4i64,
@@ -9453,16 +9476,16 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
   // If the shuffle mask is repeated in each 128-bit lane, we have many more
   // options to efficiently lower the shuffle.
-  if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask)) {
-    ArrayRef<int> LoMask = Mask.slice(0, 4);
+  SmallVector<int, 4> RepeatedMask;
+  if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
     if (isSingleInputShuffleMask(Mask))
       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
-                         getV4X86ShuffleImm8ForMask(LoMask, DAG));
+                         getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
 
     // Use dedicated unpack instructions for masks that match their pattern.
-    if (isShuffleEquivalent(LoMask, 0, 8, 1, 9))
+    if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
-    if (isShuffleEquivalent(LoMask, 2, 10, 3, 11))
+    if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
 
     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index ddf041f51b7..00ee663210b 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -450,7 +450,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
 ;
 ; AVX2-LABEL: @shuffle_v4i64_4012
 ; AVX2:       # BB#0:
-; AVX2-NEXT:  vpshufd {{.*}} # ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2-NEXT:  vpermq {{.*}} # ymm0 = ymm0[0,0,1,2]
 ; AVX2-NEXT:  vpblendd {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX2-NEXT:  retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
@@ -484,8 +484,8 @@ define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) {
 ;
 ; AVX2-LABEL: @shuffle_v4i64_0451
 ; AVX2:       # BB#0:
-; AVX2-NEXT:  vpshufd {{.*}} # ymm1 = ymm1[0,1,0,1,4,5,4,5]
-; AVX2-NEXT:  vpshufd {{.*}} # ymm0 = ymm0[0,1,2,3,4,5,6,7]
+; AVX2-NEXT:  vpermq {{.*}} # ymm1 = ymm1[0,0,1,3]
+; AVX2-NEXT:  vpermq {{.*}} # ymm0 = ymm0[0,1,2,1]
 ; AVX2-NEXT:  vpblendd {{.*}} # ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
 ; AVX2-NEXT:  retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
@@ -519,8 +519,8 @@ define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {
 ;
 ; AVX2-LABEL: @shuffle_v4i64_4015
 ; AVX2:       # BB#0:
-; AVX2-NEXT:  vpshufd {{.*}} # ymm1 = ymm1[0,1,2,3,4,5,6,7]
-; AVX2-NEXT:  vpshufd {{.*}} # ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2-NEXT:  vpermq {{.*}} # ymm1 = ymm1[0,1,2,1]
+; AVX2-NEXT:  vpermq {{.*}} # ymm0 = ymm0[0,0,1,3]
 ; AVX2-NEXT:  vpblendd {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
 ; AVX2-NEXT:  retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
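
For readers outside the LLVM tree, the sketch below mirrors the fixed
detection logic as a standalone program. It is a minimal re-implementation,
not LLVM API: std::vector stands in for ArrayRef/SmallVectorImpl, and
isRepeated128BitLaneMask is an illustrative name. Mask entries follow the
shufflevector convention: -1 is undef, [0, Size) selects from V1, and
[Size, 2*Size) selects from V2.

#include <cstdio>
#include <vector>

// Standalone mirror of is128BitLaneRepeatedShuffleMask from the patch above.
// Returns true if Mask performs the same lane-relative shuffle in every
// 128-bit lane, filling RepeatedMask with that per-lane pattern (entries
// taken from V2 are kept distinct by offsetting them by Size).
static bool isRepeated128BitLaneMask(int ScalarSizeInBits,
                                     const std::vector<int> &Mask,
                                     std::vector<int> &RepeatedMask) {
  int LaneSize = 128 / ScalarSizeInBits;
  int Size = (int)Mask.size();
  RepeatedMask.assign(LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue; // Undef entries constrain nothing: the heart of the fix.
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      return false; // Entry crosses 128-bit lanes; cannot be repeated.
    // Reduce the entry to a lane-local value, remembering which input it
    // came from, and check it against the pattern seen so far.
    int LaneLocal = Mask[i] % LaneSize + (Mask[i] < Size ? 0 : Size);
    if (RepeatedMask[i % LaneSize] < 0)
      RepeatedMask[i % LaneSize] = LaneLocal; // First non-undef in this slot.
    else if (RepeatedMask[i % LaneSize] != LaneLocal)
      return false; // Mismatch with the established repeated pattern.
  }
  return true;
}

int main() {
  std::vector<int> Rep;
  // <4 x i64> mask with undef in the first lane. The pre-fix loop compared
  // later entries against the *numeric* value -1 and so accepted this
  // lane-crossing mask; the fixed logic rejects it.
  std::vector<int> Bad = {-1, 1, 1, 3};
  std::printf("{-1,1,1,3}: %s\n",
              isRepeated128BitLaneMask(64, Bad, Rep) ? "repeated"
                                                     : "not repeated");
  // Genuinely repeated: each lane takes slot 0 from V2 and slot 1 from V1.
  std::vector<int> Good = {4, 1, 6, 3};
  std::printf("{4,1,6,3}: %s\n",
              isRepeated128BitLaneMask(64, Good, Rep) ? "repeated"
                                                      : "not repeated");
  return 0;
}

Built with any C++11 compiler, this should print "not repeated" for the
first mask and "repeated" for the second, matching what the patched LLVM
routine computes for the same inputs.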