[X86][AVX512] Only combine EVEX targets shuffles to shuffles of the same number of vector elements

Over eager combing prevents the correct folding of writemasks. At the moment this occurs for ALL EVEX shuffles, in the future we need to check that the user of the root shuffle is a VSELECT that can fold to a writemask. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@279934 91177308-0d34-0410-b5e6-96231b3b80d8
2024-11-24 12:19:53 +00:00 · 2016-08-28 17:27:14 +00:00 · 2016-08-28 17:27:14 +00:00 · 337ddd9188
commit 337ddd9188
parent afa0d1049b
2 changed files with 20 additions and 8 deletions
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -25287,6 +25287,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
  }

  unsigned RootSizeInBits = RootVT.getSizeInBits();
+  unsigned NumRootElts = RootVT.getVectorNumElements();
  unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
  bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
                     (RootVT.is256BitVector() && !Subtarget.hasAVX2());
@ -25297,11 +25298,10 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
  // TODO - this currently prevents all lane shuffles from occurring.
  // TODO - check for writemasks usage instead of always preventing combining.
  // TODO - attempt to narrow Mask back to writemask size.
-  if (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits &&
-      (RootSizeInBits == 512 ||
-       (Subtarget.hasVLX() && RootSizeInBits >= 128))) {
+  bool IsEVEXShuffle =
+      RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
+  if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
    return false;
-  }

  // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.

@ -25370,6 +25370,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
    if (matchUnaryVectorShuffle(MaskVT, Mask, Subtarget, Shuffle, ShuffleVT)) {
      if (Depth == 1 && Root.getOpcode() == Shuffle)
        return false; // Nothing to do!
+      if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
+        return false; // AVX512 Writemask clash.
      Res = DAG.getBitcast(ShuffleVT, V1);
      DCI.AddToWorklist(Res.getNode());
      Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
@ -25383,6 +25385,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
                                       ShuffleVT, PermuteImm)) {
      if (Depth == 1 && Root.getOpcode() == Shuffle)
        return false; // Nothing to do!
+      if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
+        return false; // AVX512 Writemask clash.
      Res = DAG.getBitcast(ShuffleVT, V1);
      DCI.AddToWorklist(Res.getNode());
      Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
@ -25398,6 +25402,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
                               ShuffleVT)) {
    if (Depth == 1 && Root.getOpcode() == Shuffle)
      return false; // Nothing to do!
+    if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
+      return false; // AVX512 Writemask clash.
    V1 = DAG.getBitcast(ShuffleVT, V1);
    DCI.AddToWorklist(V1.getNode());
    V2 = DAG.getBitcast(ShuffleVT, V2);
@ -25413,6 +25419,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
                                      Shuffle, ShuffleVT, PermuteImm)) {
    if (Depth == 1 && Root.getOpcode() == Shuffle)
      return false; // Nothing to do!
+    if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
+      return false; // AVX512 Writemask clash.
    V1 = DAG.getBitcast(ShuffleVT, V1);
    DCI.AddToWorklist(V1.getNode());
    V2 = DAG.getBitcast(ShuffleVT, V2);
--- a/test/CodeGen/X86/vector-half-conversions.ll
+++ b/test/CodeGen/X86/vector-half-conversions.ll
@ -3241,7 +3241,8 @@ define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
-; AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpxord %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; AVX512VL-NEXT:    addq $24, %rsp
 ; AVX512VL-NEXT:    popq %rbx
 ; AVX512VL-NEXT:    popq %r14
@ -4148,7 +4149,8 @@ define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwi
 ; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
-; AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpxord %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; AVX512VL-NEXT:    vmovdqa32 %xmm0, (%r14)
 ; AVX512VL-NEXT:    addq $16, %rsp
 ; AVX512VL-NEXT:    popq %rbx
@ -5136,7 +5138,8 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
 ; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
-; AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpxord %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; AVX512VL-NEXT:    addq $40, %rsp
 ; AVX512VL-NEXT:    popq %rbx
 ; AVX512VL-NEXT:    popq %r14
@ -5939,7 +5942,8 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
 ; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
-; AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpxord %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; AVX512VL-NEXT:    vmovdqa32 %xmm0, (%r14)
 ; AVX512VL-NEXT:    addq $32, %rsp
 ; AVX512VL-NEXT:    popq %rbx