[x86] Teach the new vector shuffle lowering about the zero masking

abilities of INSERTPS, which are really powerful and come up in very important contexts such as forming diagonal matrices, etc.

With this I ended up being able to remove the somewhat weird helper I added for INSERTPS because we can collapse the entire state to a no-op mask.

Added a bunch of tests for inserting into a zero-ish vector.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217117 91177308-0d34-0410-b5e6-96231b3b80d8
2024-12-12 06:06:19 +00:00 · 2014-09-04 01:13:48 +00:00 · 2014-09-04 01:13:48 +00:00 · fa2dfaedf2
commit fa2dfaedf2
parent e35ac41a3a
2 changed files with 152 additions and 22 deletions
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -7182,21 +7182,6 @@ static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
  return true;
 }

-/// \brief Check wether all of one set of inputs to a shuffle mask are in place.
-///
-/// Mask entries pointing at the other input or undef will be skipped.
-static bool isShuffleMaskInputInPlace(ArrayRef<int> Mask, bool LoInput = true) {
-  int Size = Mask.size();
-  for (int i = 0; i < Size; ++i) {
-    int M = Mask[i];
-    if (M == -1 || (LoInput && M >= 4) || (!LoInput && M < 4))
-      continue;
-    if (M - (LoInput ? 0 : Size) != i)
-      return false;
-  }
-  return true;
-}
-
 // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
 // 2013 will allow us to use it as a non-type template parameter.
 namespace {
@ -7385,13 +7370,48 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    // INSERTPS when the V1 elements are already in the correct locations
    // because otherwise we can just always use two SHUFPS instructions which
    // are much smaller to encode than a SHUFPS and an INSERTPS.
-    if (Subtarget->hasSSE41() &&
-        isShuffleMaskInputInPlace(Mask, /*LoInput*/ true)) {
-      // Insert the V2 element into the desired position.
-      SDValue InsertPSMask =
-          DAG.getIntPtrConstant(Mask[V2Index] << 6 | V2Index << 4);
-      return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
-                         InsertPSMask);
+    if (Subtarget->hasSSE41()) {
+      // When using INSERTPS we can zero any lane of the destination. Collect
+      // the zero inputs into a mask and drop them from the lanes of V1 which
+      // actually need to be present as inputs to the INSERTPS.
+      unsigned ZMask = 0;
+      if (ISD::isBuildVectorAllZeros(V1.getNode())) {
+        ZMask = 0xF ^ (1 << V2Index);
+      } else if (V1.getOpcode() == ISD::BUILD_VECTOR) {
+        for (int i = 0; i < 4; ++i) {
+          int M = Mask[i];
+          if (M >= 4)
+            continue;
+          if (M > -1) {
+            SDValue Input = V1.getOperand(M);
+            if (Input.getOpcode() != ISD::UNDEF &&
+                !X86::isZeroNode(Input)) {
+              // A non-zero input!
+              ZMask = 0;
+              break;
+            }
+          }
+          ZMask |= 1 << i;
+        }
+      }
+
+      // Synthesize a shuffle mask for the non-zero and non-v2 inputs.
+      int InsertShuffleMask[4] = {-1, -1, -1, -1};
+      for (int i = 0; i < 4; ++i)
+        if (i != V2Index && (ZMask & (1 << i)) == 0)
+          InsertShuffleMask[i] = Mask[i];
+
+      if (isNoopShuffleMask(InsertShuffleMask)) {
+        // Replace V1 with undef if nothing from V1 survives the INSERTPS.
+        if ((ZMask | 1 << V2Index) == 0xF)
+          V1 = DAG.getUNDEF(MVT::v4f32);
+
+        // Insert the V2 element into the desired position.
+        SDValue InsertPSMask =
+            DAG.getIntPtrConstant(Mask[V2Index] << 6 | V2Index << 4 | ZMask);
+        return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+                           InsertPSMask);
+      }
    }

    // Compute the index adjacent to V2Index and in the same half by toggling
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@ -207,3 +207,113 @@ define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) {
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
  ret <4 x i32> %shuffle
 }
+
+define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_4zzz
+; SSE2:         xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,0],[[X]][1,0]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2],[[X]][2,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v4f32_4zzz
+; SSE41:         insertps {{.*}} # xmm0 = xmm0[0],zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: @shuffle_v4f32_4zzz
+; AVX1:         vinsertps {{.*}} # xmm0 = xmm0[0],zero,zero,zero
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_z4zz
+; SSE2:         xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,0],[[X]][2,0]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[2,0],[[X]][3,0]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v4f32_z4zz
+; SSE41:         insertps {{.*}} # xmm0 = zero,xmm0[0],zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: @shuffle_v4f32_z4zz
+; AVX1:         vinsertps {{.*}} # xmm0 = zero,xmm0[0],zero,zero
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_zz4z
+; SSE2:         xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,0],[[X]][0,0]
+; SSE2-NEXT:    shufps {{.*}} # [[X]] = [[X]][0,0],xmm0[0,2]
+; SSE2-NEXT:    movaps %[[X]], %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v4f32_zz4z
+; SSE41:         insertps {{.*}} # xmm0 = zero,zero,xmm0[0],zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: @shuffle_v4f32_zz4z
+; AVX1:         vinsertps {{.*}} # xmm0 = zero,zero,xmm0[0],zero
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_zuu4
+; SSE2:         xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    shufps {{.*}} # [[X]] = [[X]][0,1],xmm0[2,0]
+; SSE2-NEXT:    movaps %[[X]], %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v4f32_zuu4
+; SSE41:         insertps {{.*}} # xmm0 = zero,zero,zero,xmm0[0]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: @shuffle_v4f32_zuu4
+; AVX1:         vinsertps {{.*}} # xmm0 = zero,zero,zero,xmm0[0]
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_zzz7
+; SSE2:         xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[3,0],[[X]][2,0]
+; SSE2-NEXT:    shufps {{.*}} # [[X]] = [[X]][0,1],xmm0[2,0]
+; SSE2-NEXT:    movaps %[[X]], %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v4f32_zzz7
+; SSE41:         insertps {{.*}} # xmm0 = zero,zero,zero,xmm0[3]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: @shuffle_v4f32_zzz7
+; AVX1:         vinsertps {{.*}} # xmm0 = zero,zero,zero,xmm0[3]
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_z6zz
+; SSE2:         xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[2,0],[[X]][0,0]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[2,0],[[X]][2,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v4f32_z6zz
+; SSE41:         insertps {{.*}} # xmm0 = zero,xmm0[2],zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: @shuffle_v4f32_z6zz
+; AVX1:         vinsertps {{.*}} # xmm0 = zero,xmm0[2],zero,zero
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  ret <4 x float> %shuffle
+}