Temporarily Revert "[SLP] allow forming 2-way reduction patterns" and update testcases.

After speaking with Sanjay - seeing a number of miscompiles and working on tracking down a testcase. None of the follow on patches seem to have helped so far. This reverts commit 8a0aa5310bccbb42d16d11db090419fcefdd1376.
2025-02-13 00:10:37 +00:00 · 2019-11-20 16:00:53 -08:00 · 2019-11-20 16:00:53 -08:00 · 7b534b3c55
commit 7b534b3c55
parent 6d982d0bf7
5 changed files with 29 additions and 53 deletions
--- a/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@ -114,12 +114,9 @@ private:

  /// Try to find horizontal reduction or otherwise vectorize a chain of binary
  /// operators.
-  /// \p Try2WayRdx specializes the analysis to only attempt a 2-element
-  /// reduction.
  bool vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB,
                                slpvectorizer::BoUpSLP &R,
-                                TargetTransformInfo *TTI,
-                                bool Try2WayRdx = false);
+                                TargetTransformInfo *TTI);

  /// Try to vectorize trees that start at insertvalue instructions.
  bool vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB,
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@ -6653,7 +6653,7 @@ public:

  /// Attempt to vectorize the tree found by
  /// matchAssociativeReduction.
-  bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI, bool Try2WayRdx) {
+  bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
    if (ReducedVals.empty())
      return false;

@ -6661,14 +6661,11 @@ public:
    // to a nearby power-of-2. Can safely generate oversized
    // vectors and rely on the backend to split them to legal sizes.
    unsigned NumReducedVals = ReducedVals.size();
-    if (Try2WayRdx && NumReducedVals != 2)
-      return false;
-    unsigned MinRdxVals = Try2WayRdx ? 2 : 4;
-    if (NumReducedVals < MinRdxVals)
+    if (NumReducedVals < 4)
      return false;

    unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
-    unsigned MinRdxWidth = Log2_32(MinRdxVals);
+
    Value *VectorizedTree = nullptr;

    // FIXME: Fast-math-flags should be set based on the instructions in the
@ -6704,7 +6701,7 @@ public:
    SmallVector<Value *, 16> IgnoreList;
    for (auto &V : ReductionOps)
      IgnoreList.append(V.begin(), V.end());
-    while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > MinRdxWidth) {
+    while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
      auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
      V.buildTree(VL, ExternallyUsedValues, IgnoreList);
      Optional<ArrayRef<unsigned>> Order = V.bestOrder();
@ -7048,7 +7045,7 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
 /// performed.
 static bool tryToVectorizeHorReductionOrInstOperands(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
-    TargetTransformInfo *TTI, bool Try2WayRdx,
+    TargetTransformInfo *TTI,
    const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
  if (!ShouldVectorizeHor)
    return false;
@ -7079,7 +7076,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
    if (BI || SI) {
      HorizontalReduction HorRdx;
      if (HorRdx.matchAssociativeReduction(P, Inst)) {
-        if (HorRdx.tryToReduce(R, TTI, Try2WayRdx)) {
+        if (HorRdx.tryToReduce(R, TTI)) {
          Res = true;
          // Set P to nullptr to avoid re-analysis of phi node in
          // matchAssociativeReduction function unless this is the root node.
@ -7122,8 +7119,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(

 bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
                                                 BasicBlock *BB, BoUpSLP &R,
-                                                 TargetTransformInfo *TTI,
-                                                 bool Try2WayRdx) {
+                                                 TargetTransformInfo *TTI) {
  if (!V)
    return false;
  auto *I = dyn_cast<Instruction>(V);
@ -7136,7 +7132,7 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
  auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
    return tryToVectorize(I, R);
  };
-  return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, Try2WayRdx,
+  return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
                                                  ExtraVectorization);
 }

@ -7332,23 +7328,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
      PostProcessInstructions.push_back(&*it);
  }

-  // Make a final attempt to match a 2-way reduction if nothing else worked.
-  // We do not try this above because it may interfere with other vectorization
-  // attempts.
-  // TODO: The constraints are copied from the above call to
-  //       vectorizeRootInstruction(), but that might be too restrictive?
-  BasicBlock::iterator LastInst = --BB->end();
-  if (!Changed && LastInst->use_empty() &&
-      (LastInst->getType()->isVoidTy() || isa<CallInst>(LastInst) ||
-       isa<InvokeInst>(LastInst))) {
-    if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(LastInst)) {
-      for (auto *V : LastInst->operand_values()) {
-        Changed |= vectorizeRootInstruction(nullptr, V, BB, R, TTI,
-                                            /* Try2WayRdx */ true);
-      }
-    }
-  }
-
  return Changed;
 }

--- a/test/Feature/weak_constant.ll
+++ b/test/Feature/weak_constant.ll
@ -1,5 +1,5 @@
 ; RUN: opt < %s -O3 -S > %t
-; RUN:   grep undef %t | count 2
+; RUN:   grep undef %t | count 1
 ; RUN:   grep 5 %t | count 1
 ; RUN:   grep 7 %t | count 1
 ; RUN:   grep 9 %t | count 1
--- a/test/Transforms/SLPVectorizer/X86/reduction.ll
+++ b/test/Transforms/SLPVectorizer/X86/reduction.ll
@ -129,16 +129,15 @@ define i32 @horiz_max_multiple_uses([32 x i32]* %x, i32* %p) {
 define i1 @bad_insertpoint_rdx([8 x i32]* %p) #0 {
 ; CHECK-LABEL: @bad_insertpoint_rdx(
 ; CHECK-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* [[P:%.*]], i64 0, i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[ARRAYIDX22]] to <2 x i32>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 16
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <2 x i32> <i32 1, i32 undef>
-; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <2 x i32> [[TMP2]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i32> [[TMP2]], <2 x i32> [[RDX_SHUF]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[RDX_MINMAX_SELECT]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], 0
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP4]], i32 [[TMP3]], i32 0
-; CHECK-NEXT:    [[SPEC_STORE_SELECT87:%.*]] = zext i1 [[TMP4]] to i32
-; CHECK-NEXT:    [[CMP23_2:%.*]] = icmp sgt i32 [[SPEC_STORE_SELECT87]], [[OP_EXTRA]]
+; CHECK-NEXT:    [[T0:%.*]] = load i32, i32* [[ARRAYIDX22]], align 16
+; CHECK-NEXT:    [[CMP23:%.*]] = icmp sgt i32 [[T0]], 0
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP23]], i32 [[T0]], i32 0
+; CHECK-NEXT:    [[ARRAYIDX22_1:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* [[P]], i64 0, i64 1
+; CHECK-NEXT:    [[T1:%.*]] = load i32, i32* [[ARRAYIDX22_1]], align 4
+; CHECK-NEXT:    [[CMP23_1:%.*]] = icmp sgt i32 [[T1]], [[SPEC_SELECT]]
+; CHECK-NEXT:    [[SPEC_STORE_SELECT87:%.*]] = zext i1 [[CMP23_1]] to i32
+; CHECK-NEXT:    [[SPEC_SELECT88:%.*]] = select i1 [[CMP23_1]], i32 [[T1]], i32 [[SPEC_SELECT]]
+; CHECK-NEXT:    [[CMP23_2:%.*]] = icmp sgt i32 [[SPEC_STORE_SELECT87]], [[SPEC_SELECT88]]
 ; CHECK-NEXT:    ret i1 [[CMP23_2]]
 ;
  %arrayidx22 = getelementptr inbounds [8 x i32], [8 x i32]* %p, i64 0, i64 0
--- a/test/Transforms/SLPVectorizer/X86/reduction2.ll
+++ b/test/Transforms/SLPVectorizer/X86/reduction2.ll
@ -54,10 +54,10 @@ define double @foo(double* nocapture %D) {
 define i1 @two_wide_fcmp_reduction(<2 x double> %a0) {
 ; CHECK-LABEL: @two_wide_fcmp_reduction(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp ogt <2 x double> [[A0:%.*]], <double 1.000000e+00, double 1.000000e+00>
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x i1> [[A]], <2 x i1> undef, <2 x i32> <i32 1, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = and <2 x i1> [[A]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i1> [[BIN_RDX]], i32 0
-; CHECK-NEXT:    ret i1 [[TMP1]]
+; CHECK-NEXT:    [[B:%.*]] = extractelement <2 x i1> [[A]], i32 0
+; CHECK-NEXT:    [[C:%.*]] = extractelement <2 x i1> [[A]], i32 1
+; CHECK-NEXT:    [[D:%.*]] = and i1 [[B]], [[C]]
+; CHECK-NEXT:    ret i1 [[D]]
 ;
  %a = fcmp ogt <2 x double> %a0, <double 1.0, double 1.0>
  %b = extractelement <2 x i1> %a, i32 0
@ -96,11 +96,12 @@ define i1 @fcmp_lt_gt(double %a, double %b, double %c) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> undef, double [[MUL]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL]], i32 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP7]], <double 0x3EB0C6F7A0B5ED8D, double 0x3EB0C6F7A0B5ED8D>
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <2 x i32> <i32 1, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = and <2 x i1> [[TMP8]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[BIN_RDX]], i32 0
-; CHECK-NEXT:    br i1 [[TMP9]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt double [[TMP8]], 0x3EB0C6F7A0B5ED8D
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
+; CHECK-NEXT:    [[CMP4:%.*]] = fcmp olt double [[TMP9]], 0x3EB0C6F7A0B5ED8D
+; CHECK-NEXT:    [[OR_COND:%.*]] = and i1 [[CMP]], [[CMP4]]
+; CHECK-NEXT:    br i1 [[OR_COND]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]]
 ; CHECK:       lor.lhs.false:
 ; CHECK-NEXT:    [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP7]], <double 1.000000e+00, double 1.000000e+00>
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0