Add wider vector/integer support for PR12312

- Enhance the fix to PR12312 to support wider integers, such as a 256-bit
  integer. If more than one fully evaluated vector is found, OR them together
  (POR) first, followed by the final PTEST.

llvm-svn: 163832
This commit is contained in:
Michael Liao 2012-09-13 20:24:54 +00:00
parent 7c620b0d5f
commit 0c0da113c5
3 changed files with 223 additions and 113 deletions

View File

@ -8347,6 +8347,98 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
}
/// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able, i.e. it
/// is a tree of EXTRACT_VECTOR_ELTs (with constant indices) OR'd together that
/// covers every lane of one or more 128-/256-bit vectors. If so, lower the
/// whole tree to a single PTEST producing EFLAGS (OR'ing the source vectors
/// together first when there is more than one). Returns a null SDValue when
/// the pattern does not match.
SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op,
                                                  SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");

  // PTEST is only available from SSE4.1 onwards.
  if (!Subtarget->hasSSE41())
    return SDValue();

  // The OR must feed only this zero-test; otherwise its integer value is
  // still needed and cannot be replaced by a flags-only PTEST.
  if (!Op->hasOneUse())
    return SDValue();

  SDNode *N = Op.getNode();
  DebugLoc DL = N->getDebugLoc();

  SmallVector<SDValue, 8> Opnds;
  // Maps each source vector to a bitmask of the element indices extracted
  // from it, so we can later check that every lane is covered.
  DenseMap<SDValue, unsigned> VecInMap;
  EVT VT = MVT::Other;

  // Recognize a special case where a vector is casted into wide integer to
  // test all 0s.
  Opnds.push_back(N->getOperand(0));
  Opnds.push_back(N->getOperand(1));

  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
    SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
    // BFS traverse all OR'd operands.
    if (I->getOpcode() == ISD::OR) {
      Opnds.push_back(I->getOperand(0));
      Opnds.push_back(I->getOperand(1));
      // Re-evaluate the number of nodes to be traversed.
      e += 2; // 2 more nodes (LHS and RHS) are pushed.
      continue;
    }

    // Quit if a non-EXTRACT_VECTOR_ELT
    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // Quit if without a constant index.
    SDValue Idx = I->getOperand(1);
    if (!isa<ConstantSDNode>(Idx))
      return SDValue();

    SDValue ExtractedFromVec = I->getOperand(0);
    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
    if (M == VecInMap.end()) {
      VT = ExtractedFromVec.getValueType();
      // Quit if not 128/256-bit vector.
      if (!VT.is128BitVector() && !VT.is256BitVector())
        return SDValue();
      // Quit if not the same type as the vectors seen so far.
      if (VecInMap.begin() != VecInMap.end() &&
          VT != VecInMap.begin()->first.getValueType())
        return SDValue();
      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
    }
    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
  }

  // Fixed: the original message claimed "128-bit" only, but 256-bit vectors
  // are explicitly accepted above.
  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Not extracted from 128-/256-bit vector.");

  // NOTE(review): 1U << NumElements is undefined for vectors with 32+
  // elements (e.g. v32i8); current callers extract i64-sized lanes, but
  // confirm if wider element counts can ever reach here.
  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
  SmallVector<SDValue, 8> VecIns;

  for (DenseMap<SDValue, unsigned>::const_iterator
        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
    // Quit if not all elements are used.
    if (I->second != FullMask)
      return SDValue();
    VecIns.push_back(I->first);
  }

  EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;

  // Cast all vectors into TestVT for PTEST.
  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
    VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);

  // If more than one full vector is evaluated, OR them first before PTEST.
  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
    // Each iteration will OR 2 nodes and append the result until there is only
    // 1 node left, i.e. the final OR'd value of all vectors.
    SDValue LHS = VecIns[Slot];
    SDValue RHS = VecIns[Slot + 1];
    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
  }

  // PTEST sets ZF when (src & src) == 0, i.e. the whole vector is zero.
  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
                     VecIns.back(), VecIns.back());
}
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
@ -8486,9 +8578,17 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
switch (ArithOp.getOpcode()) {
default: llvm_unreachable("unexpected operator!");
case ISD::SUB: Opcode = X86ISD::SUB; break;
case ISD::OR: Opcode = X86ISD::OR; break;
case ISD::XOR: Opcode = X86ISD::XOR; break;
case ISD::AND: Opcode = X86ISD::AND; break;
case ISD::OR: {
if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
SDValue EFLAGS = LowerVectorAllZeroTest(Op, DAG);
if (EFLAGS.getNode())
return EFLAGS;
}
Opcode = X86ISD::OR;
break;
}
}
NumOperands = 2;
@ -14205,84 +14305,6 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
return SDValue();
}
/// checkFlaggedOrCombine - DAG combination on X86ISD::OR, i.e. with EFLAGS
/// updated. If only flag result is used and the result is evaluated from a
/// series of element extraction, try to combine it into a PTEST.
/// Returns a null SDValue when the pattern does not match; otherwise returns
/// the PTEST node whose flags replace the OR's flags.
static SDValue checkFlaggedOrCombine(SDValue Or, X86::CondCode &CC,
SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
SDNode *N = Or.getNode();
DebugLoc DL = N->getDebugLoc();
// Only SSE4.1 and beyond supports PTEST or like.
if (!Subtarget->hasSSE41())
return SDValue();
if (N->getOpcode() != X86ISD::OR)
return SDValue();
// Quit if the value result of OR is used.
// (PTEST produces only flags, so the integer result of the OR must be dead
// for the replacement to be valid.)
if (N->hasAnyUseOfValue(0))
return SDValue();
// Quit if not used as a boolean value.
// Only equality with zero (E/NE) maps onto PTEST's ZF semantics.
if (CC != X86::COND_E && CC != X86::COND_NE)
return SDValue();
// Worklist of OR'd operands still to examine, the single source vector all
// extracts must come from, and a bitmask of extracted element indices.
SmallVector<SDValue, 8> Opnds;
SDValue VecIn;
EVT VT = MVT::Other;
unsigned Mask = 0;
// Recognize a special case where a vector is casted into wide integer to
// test all 0s.
Opnds.push_back(N->getOperand(0));
Opnds.push_back(N->getOperand(1));
for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
// BFS traverse all OR'd operands.
if (I->getOpcode() == ISD::OR) {
Opnds.push_back(I->getOperand(0));
Opnds.push_back(I->getOperand(1));
// Re-evaluate the number of nodes to be traversed.
e += 2; // 2 more nodes (LHS and RHS) are pushed.
continue;
}
// Quit if a non-EXTRACT_VECTOR_ELT
if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
// Quit if without a constant index.
SDValue Idx = I->getOperand(1);
if (!isa<ConstantSDNode>(Idx))
return SDValue();
// Check if all elements are extracted from the same vector.
SDValue ExtractedFromVec = I->getOperand(0);
if (VecIn.getNode() == 0) {
VT = ExtractedFromVec.getValueType();
// FIXME: only 128-bit vector is supported so far.
if (!VT.is128BitVector())
return SDValue();
VecIn = ExtractedFromVec;
} else if (VecIn != ExtractedFromVec)
return SDValue();
// Record the constant index.
Mask |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
}
assert(VT.is128BitVector() && "Only 128-bit vector PTEST is supported so far.");
// Quit if not all elements are used.
if (Mask != (1U << VT.getVectorNumElements()) - 1U)
return SDValue();
// PTEST(v, v) sets ZF iff the whole vector is zero, matching the E/NE test.
return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIn, VecIn);
}
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@ -14321,14 +14343,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
Ops, array_lengthof(Ops));
}
Flags = checkFlaggedOrCombine(Cond, CC, DAG, Subtarget);
if (Flags.getNode()) {
SDValue Ops[] = { FalseOp, TrueOp,
DAG.getConstant(CC, MVT::i8), Flags };
return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
Ops, array_lengthof(Ops));
}
// If this is a select between two integer constants, try to do some
// optimizations. Note that the operands are ordered the opposite of SELECT
// operands.
@ -15860,12 +15874,6 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
}
Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget);
if (Flags.getNode()) {
SDValue Cond = DAG.getConstant(CC, MVT::i8);
return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
}
return SDValue();
}
@ -15889,13 +15897,6 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
Flags);
}
Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget);
if (Flags.getNode()) {
SDValue Cond = DAG.getConstant(CC, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
Flags);
}
return SDValue();
}

View File

@ -811,6 +811,8 @@ namespace llvm {
SDValue LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const;
SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorFpExtend(SDValue &Op, SelectionDAG &DAG) const;
virtual SDValue

View File

@ -1,7 +1,7 @@
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse41,-avx < %s | FileCheck %s --check-prefix SSE41
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix AVX
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx,-avx2 < %s | FileCheck %s --check-prefix AVX
define i32 @veccond(<4 x i32> %input) {
define i32 @veccond128(<4 x i32> %input) {
entry:
%0 = bitcast <4 x i32> %input to i128
%1 = icmp ne i128 %0, 0
@ -11,38 +11,145 @@ if-true-block: ; preds = %entry
ret i32 0
endif-block: ; preds = %entry,
ret i32 1
; SSE41: veccond
; SSE41: veccond128
; SSE41: ptest
; SSE41: ret
; AVX: veccond
; AVX: vptest
; AVX: veccond128
; AVX: vptest %xmm{{.*}}, %xmm{{.*}}
; AVX: ret
}
define i32 @vectest(<4 x i32> %input) {
; Branch on an i256 all-zeros test of a bitcast <8 x i32>. With SSE41 the two
; 128-bit halves are OR'd (por) before a single ptest; with AVX one 256-bit
; vptest suffices.
define i32 @veccond256(<8 x i32> %input) {
entry:
%0 = bitcast <8 x i32> %input to i256
%1 = icmp ne i256 %0, 0
br i1 %1, label %if-true-block, label %endif-block
if-true-block: ; preds = %entry
ret i32 0
endif-block: ; preds = %entry,
ret i32 1
; SSE41: veccond256
; SSE41: por
; SSE41: ptest
; SSE41: ret
; AVX: veccond256
; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
; AVX: ret
}
; Branch on an i512 all-zeros test. With SSE41 the four 128-bit pieces need
; three por instructions before the single ptest; with AVX the two 256-bit
; halves are OR'd (vorps) before one vptest.
define i32 @veccond512(<16 x i32> %input) {
entry:
%0 = bitcast <16 x i32> %input to i512
%1 = icmp ne i512 %0, 0
br i1 %1, label %if-true-block, label %endif-block
if-true-block: ; preds = %entry
ret i32 0
endif-block: ; preds = %entry,
ret i32 1
; SSE41: veccond512
; SSE41: por
; SSE41: por
; SSE41: por
; SSE41: ptest
; SSE41: ret
; AVX: veccond512
; AVX: vorps
; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
; AVX: ret
}
; Zero-extend of an i128 all-zeros test of a bitcast <4 x i32> should lower to
; a single (v)ptest rather than scalar compares.
define i32 @vectest128(<4 x i32> %input) {
entry:
%0 = bitcast <4 x i32> %input to i128
%1 = icmp ne i128 %0, 0
%2 = zext i1 %1 to i32
ret i32 %2
; SSE41: vectest
; SSE41: vectest128
; SSE41: ptest
; SSE41: ret
; AVX: vectest
; AVX: vptest
; AVX: vectest128
; AVX: vptest %xmm{{.*}}, %xmm{{.*}}
; AVX: ret
}
define i32 @vecsel(<4 x i32> %input, i32 %a, i32 %b) {
; Zero-extend of an i256 all-zeros test. SSE41 OR's the two 128-bit halves
; (por) before a single ptest; AVX uses one 256-bit vptest.
define i32 @vectest256(<8 x i32> %input) {
entry:
%0 = bitcast <8 x i32> %input to i256
%1 = icmp ne i256 %0, 0
%2 = zext i1 %1 to i32
ret i32 %2
; SSE41: vectest256
; SSE41: por
; SSE41: ptest
; SSE41: ret
; AVX: vectest256
; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
; AVX: ret
}
; Zero-extend of an i512 all-zeros test. SSE41 needs three por instructions to
; combine the four 128-bit pieces before the single ptest; AVX OR's the two
; 256-bit halves (vorps) before one vptest.
define i32 @vectest512(<16 x i32> %input) {
entry:
%0 = bitcast <16 x i32> %input to i512
%1 = icmp ne i512 %0, 0
%2 = zext i1 %1 to i32
ret i32 %2
; SSE41: vectest512
; SSE41: por
; SSE41: por
; SSE41: por
; SSE41: ptest
; SSE41: ret
; AVX: vectest512
; AVX: vorps
; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
; AVX: ret
}
; Select driven by an i128 all-zeros test of a bitcast <4 x i32>; should lower
; to a single (v)ptest feeding a conditional move.
define i32 @vecsel128(<4 x i32> %input, i32 %a, i32 %b) {
entry:
%0 = bitcast <4 x i32> %input to i128
%1 = icmp ne i128 %0, 0
%2 = select i1 %1, i32 %a, i32 %b
ret i32 %2
; SSE41: vecsel
; SSE41: vecsel128
; SSE41: ptest
; SSE41: ret
; AVX: vecsel
; AVX: vptest
; AVX: vecsel128
; AVX: vptest %xmm{{.*}}, %xmm{{.*}}
; AVX: ret
}
; Select driven by an i256 all-zeros test. SSE41 OR's the two 128-bit halves
; (por) before a single ptest; AVX uses one 256-bit vptest.
define i32 @vecsel256(<8 x i32> %input, i32 %a, i32 %b) {
entry:
%0 = bitcast <8 x i32> %input to i256
%1 = icmp ne i256 %0, 0
%2 = select i1 %1, i32 %a, i32 %b
ret i32 %2
; SSE41: vecsel256
; SSE41: por
; SSE41: ptest
; SSE41: ret
; AVX: vecsel256
; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
; AVX: ret
}
; Select driven by an i512 all-zeros test. SSE41 combines the four 128-bit
; pieces with three por instructions before the single ptest; AVX OR's the two
; 256-bit halves (vorps) before one vptest.
define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) {
entry:
%0 = bitcast <16 x i32> %input to i512
%1 = icmp ne i512 %0, 0
%2 = select i1 %1, i32 %a, i32 %b
ret i32 %2
; SSE41: vecsel512
; SSE41: por
; SSE41: por
; SSE41: por
; SSE41: ptest
; SSE41: ret
; AVX: vecsel512
; AVX: vorps
; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
; AVX: ret
}