[LoopVectorize] Register cloned assumptions

InstCombine cannot effectively remove redundant assumptions unless they are registered in the assumption cache. The vectorizer can create identical assumptions but doesn't register them with the cache, resulting in slower compile times because InstCombine tries to reason about many more assumptions. Fix this by registering the cloned assumptions. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@265800 91177308-0d34-0410-b5e6-96231b3b80d8
2025-02-10 06:24:58 +00:00 · 2016-04-08 16:37:10 +00:00 · 2016-04-08 16:37:10 +00:00 · 951ea8be17
commit 951ea8be17
parent 7500ba0386
2 changed files with 56 additions and 10 deletions
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@ -314,13 +314,13 @@ public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
-                      const TargetTransformInfo *TTI, unsigned VecWidth,
-                      unsigned UnrollFactor)
+                      const TargetTransformInfo *TTI, AssumptionCache *AC,
+                      unsigned VecWidth, unsigned UnrollFactor)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
-        VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
-        Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
-        TripCount(nullptr), VectorTripCount(nullptr), Legal(nullptr),
-        AddedSafetyChecks(false) {}
+        AC(AC), VF(VecWidth), UF(UnrollFactor),
+        Builder(PSE.getSE()->getContext()), Induction(nullptr),
+        OldInduction(nullptr), WidenMap(UnrollFactor), TripCount(nullptr),
+        VectorTripCount(nullptr), Legal(nullptr), AddedSafetyChecks(false) {}

  // Perform the actual loop widening (vectorization).
  // MinimumBitWidths maps scalar integer values to the smallest bitwidth they
@ -524,6 +524,8 @@ protected:
  const TargetLibraryInfo *TLI;
  /// Target Transform Info.
  const TargetTransformInfo *TTI;
+  /// Assumption Cache.
+  AssumptionCache *AC;

  /// \brief LoopVersioning.  It's only set up (non-null) if memchecks were
  /// used.
@ -591,8 +593,10 @@ public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
-                    const TargetTransformInfo *TTI, unsigned UnrollFactor)
-      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, 1, UnrollFactor) {}
+                    const TargetTransformInfo *TTI, AssumptionCache *AC,
+                    unsigned UnrollFactor)
+      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, 1,
+                            UnrollFactor) {}

 private:
  void scalarizeInstruction(Instruction *Instr,
@ -1957,7 +1961,7 @@ struct LoopVectorize : public FunctionPass {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not legal to vectorize the loop then
      // interleave it.
-      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, IC);
+      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, IC);
      Unroller.vectorize(&LVL, CM.MinBWs);

      emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(),
@ -1965,7 +1969,7 @@ struct LoopVectorize : public FunctionPass {
                                 Twine(IC) + ")");
    } else {
      // If we decided that it is *legal* to vectorize the loop then do it.
-      InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, VF.Width, IC);
+      InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, VF.Width, IC);
      LB.vectorize(&LVL, CM.MinBWs);
      ++LoopsVectorized;

@ -2728,6 +2732,11 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
      // Place the cloned scalar in the new loop.
      Builder.Insert(Cloned);

+      // If we just cloned a new assumption, add it to the assumption cache.
+      if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
+        if (II->getIntrinsicID() == Intrinsic::assume)
+          AC->registerAssumption(II);
+
      // If the original scalar returns a value we need to place it in a vector
      // so that future users will be able to use it.
      if (!IsVoidRetTy)
@ -6096,6 +6105,11 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
      // Place the cloned scalar in the new loop.
      Builder.Insert(Cloned);

+      // If we just cloned a new assumption, add it to the assumption cache.
+      if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
+        if (II->getIntrinsicID() == Intrinsic::assume)
+          AC->registerAssumption(II);
+
      // If the original scalar returns a value we need to place it in a vector
      // so that future users will be able to use it.
      if (!IsVoidRetTy)
--- a/test/Transforms/LoopVectorize/X86/register-assumption.ll
+++ b/test/Transforms/LoopVectorize/X86/register-assumption.ll
@ -0,0 +1,32 @@
+; RUN: opt < %s  -loop-vectorize -instcombine -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test1() {
+entry:
+  %alloca = alloca float, align 4
+  br label %loop_exit.dim.11.critedge
+
+loop_exit.dim.11.critedge:                        ; preds = %loop_body.dim.0
+  %ptrint = ptrtoint float* %alloca to i64
+  %maskedptr = and i64 %ptrint, 4
+  %maskcond = icmp eq i64 %maskedptr, 0
+  br label %loop_header.dim.017.preheader
+
+loop_header.dim.017.preheader:                    ; preds = %loop_exit.dim.016, %loop_exit.dim.11.critedge
+  br label %loop_body.dim.018
+
+loop_body.dim.018:                                ; preds = %loop_body.dim.018, %loop_header.dim.017.preheader
+  %invar_address.dim.019.0135 = phi i64 [ 0, %loop_header.dim.017.preheader ], [ %0, %loop_body.dim.018 ]
+  call void @llvm.assume(i1 %maskcond)
+; CHECK:     call void @llvm.assume(
+; CHECK-NOT: call void @llvm.assume(
+  %0 = add nuw nsw i64 %invar_address.dim.019.0135, 1
+  %1 = icmp eq i64 %0, 256
+  br i1 %1, label %loop_header.dim.017.preheader, label %loop_body.dim.018
+}
+
+; Function Attrs: nounwind
+declare void @llvm.assume(i1) #0
+
+attributes #0 = { nounwind }