This change fixes three bugs in loop unswitching. This change causes an 81% speed-up on a benchmark that is based on EigenConvolutionKernel2D from Eigen3, where the lack of loop unswitching blocks hoisting of loads out of a nested loop (see bug 23816 for how loop unswitching and load hoisting are related).

Change 1: Unswitching on trivial conditions should always happen regardless of the computed unswitching cost, as really the cost is zero. While there is code to make that happen, the logic that checks the unswitching cost against a threshold was moved to an earlier point (revision 147935) than the point where trivial unswitching is detected, so trivial unswitching is currently blocked by the cost threshold. This change fixes that. Change 2: Before revision 147935 (from 2012-01-11), the threshold parameter was a per-loop threshold. So an unswitching happened only if the cost of the unswitching was less than the threshold. In an indirect way (and I believe unintentionally), the logic for this since then has been that the threshold is an over-all budget across all loops for all loop unswitching done by a given LoopUnswitch loop pass object. So if an unswitching with cost 100 happens in one function, that in effect reduces the threshold from 100 to 0 for the loops even in another function. This persists for the lifetime of that loop pass object. This makes no difference for most small examples but it is important for large examples. This revision fixes that. Change 3: The cost is currently calculated as std::min(NumInstructions, 5 * NumBlocks). So a loop with 2 blocks and a million instructions will have an unswitching cost of 10. I changed this to just NumInstructions, as it were before revision 147935, though I'm open to e.g. instead replacing std::min with std::max. I've tried to make the change minimally invasive while staying with what I think was the original intent of the code. Submitted on behalf of broune@. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@240438 91177308-0d34-0410-b5e6-96231b3b80d8
2025-04-03 08:41:44 +00:00 · 2015-06-23 18:26:50 +00:00 · 2015-06-23 18:26:50 +00:00 · fbd746614d
commit fbd746614d
parent 572587cad1
1 changed files with 59 additions and 34 deletions
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@ -81,6 +81,7 @@ namespace {

    struct LoopProperties {
      unsigned CanBeUnswitchedCount;
+      unsigned WasUnswitchedCount;
      unsigned SizeEstimation;
      UnswitchedValsMap UnswitchedVals;
    };
@ -94,37 +95,52 @@ namespace {
    UnswitchedValsMap *CurLoopInstructions;
    LoopProperties *CurrentLoopProperties;

-    // Max size of code we can produce on remained iterations.
+    // A loop unswitching with an estimated cost above this threshold
+    // is not performed. MaxSize is turned into unswitching quota for
+    // the current loop, and reduced correspondingly, though note that
+    // the quota is returned by releaseMemory() when the loop has been
+    // processed, so that MaxSize will return to its previous
+    // value. So in most cases MaxSize will equal the Threshold flag
+    // when a new loop is processed. An exception to that is that
+    // MaxSize will have a smaller value while processing nested loops
+    // that were introduced due to loop unswitching of an outer loop.
+    //
+    // FIXME: The way that MaxSize works is subtle and depends on the
+    // pass manager processing loops and calling releaseMemory() in a
+    // specific order. It would be good to find a more straightforward
+    // way of doing what MaxSize does.
    unsigned MaxSize;

-    public:
+  public:
+    LUAnalysisCache()
+        : CurLoopInstructions(nullptr), CurrentLoopProperties(nullptr),
+          MaxSize(Threshold) {}

-      LUAnalysisCache() :
-        CurLoopInstructions(nullptr), CurrentLoopProperties(nullptr),
-        MaxSize(Threshold)
-      {}
+    // Analyze loop. Check its size, calculate is it possible to unswitch
+    // it. Returns true if we can unswitch this loop.
+    bool countLoop(const Loop *L, const TargetTransformInfo &TTI,
+                   AssumptionCache *AC);

-      // Analyze loop. Check its size, calculate is it possible to unswitch
-      // it. Returns true if we can unswitch this loop.
-      bool countLoop(const Loop *L, const TargetTransformInfo &TTI,
-                     AssumptionCache *AC);
+    // Clean all data related to given loop.
+    void forgetLoop(const Loop *L);

-      // Clean all data related to given loop.
-      void forgetLoop(const Loop *L);
+    // Mark case value as unswitched.
+    // Since SI instruction can be partly unswitched, in order to avoid
+    // extra unswitching in cloned loops keep track all unswitched values.
+    void setUnswitched(const SwitchInst *SI, const Value *V);

-      // Mark case value as unswitched.
-      // Since SI instruction can be partly unswitched, in order to avoid
-      // extra unswitching in cloned loops keep track all unswitched values.
-      void setUnswitched(const SwitchInst *SI, const Value *V);
+    // Check was this case value unswitched before or not.
+    bool isUnswitched(const SwitchInst *SI, const Value *V);

-      // Check was this case value unswitched before or not.
-      bool isUnswitched(const SwitchInst *SI, const Value *V);
+    // Returns true if another unswitching could be done within the cost
+    // threshold.
+    bool CostAllowsUnswitching();

-      // Clone all loop-unswitch related loop properties.
-      // Redistribute unswitching quotas.
-      // Note, that new loop data is stored inside the VMap.
-      void cloneData(const Loop *NewLoop, const Loop *OldLoop,
-                     const ValueToValueMapTy &VMap);
+    // Clone all loop-unswitch related loop properties.
+    // Redistribute unswitching quotas.
+    // Note, that new loop data is stored inside the VMap.
+    void cloneData(const Loop *NewLoop, const Loop *OldLoop,
+                   const ValueToValueMapTy &VMap);
  };

  class LoopUnswitch : public LoopPass {
@ -246,12 +262,13 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI,
    // consideration code simplification opportunities and code that can
    // be shared by the resultant unswitched loops.
    CodeMetrics Metrics;
-    for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
-         I != E; ++I)
+    for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E;
+         ++I)
      Metrics.analyzeBasicBlock(*I, TTI, EphValues);

-    Props.SizeEstimation = std::min(Metrics.NumInsts, Metrics.NumBlocks * 5);
+    Props.SizeEstimation = Metrics.NumInsts;
    Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation);
+    Props.WasUnswitchedCount = 0;
    MaxSize -= Props.SizeEstimation * Props.CanBeUnswitchedCount;

    if (Metrics.notDuplicatable) {
@ -262,13 +279,6 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI,
    }
  }

-  if (!Props.CanBeUnswitchedCount) {
-    DEBUG(dbgs() << "NOT unswitching loop %"
-                 << L->getHeader()->getName() << ", cost too high: "
-                 << L->getBlocks().size() << "\n");
-    return false;
-  }
-
  // Be careful. This links are good only before new loop addition.
  CurrentLoopProperties = &Props;
  CurLoopInstructions = &Props.UnswitchedVals;
@ -283,7 +293,8 @@ void LUAnalysisCache::forgetLoop(const Loop *L) {

  if (LIt != LoopsProperties.end()) {
    LoopProperties &Props = LIt->second;
-    MaxSize += Props.CanBeUnswitchedCount * Props.SizeEstimation;
+    MaxSize += (Props.CanBeUnswitchedCount + Props.WasUnswitchedCount) *
+               Props.SizeEstimation;
    LoopsProperties.erase(LIt);
  }

@ -303,6 +314,10 @@ bool LUAnalysisCache::isUnswitched(const SwitchInst *SI, const Value *V) {
  return (*CurLoopInstructions)[SI].count(V);
 }

+bool LUAnalysisCache::CostAllowsUnswitching() {
+  return CurrentLoopProperties->CanBeUnswitchedCount > 0;
+}
+
 // Clone all loop-unswitch related loop properties.
 // Redistribute unswitching quotas.
 // Note, that new loop data is stored inside the VMap.
@ -316,6 +331,8 @@ void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop,
  // Reallocate "can-be-unswitched quota"

  --OldLoopProps.CanBeUnswitchedCount;
+  ++OldLoopProps.WasUnswitchedCount;
+  NewLoopProps.WasUnswitchedCount = 0;
  unsigned Quota = OldLoopProps.CanBeUnswitchedCount;
  NewLoopProps.CanBeUnswitchedCount = Quota / 2;
  OldLoopProps.CanBeUnswitchedCount = Quota - Quota / 2;
@ -661,6 +678,14 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,
  }

  // Check to see if it would be profitable to unswitch current loop.
+  if (!BranchesInfo.CostAllowsUnswitching()) {
+    DEBUG(dbgs() << "NOT unswitching loop %"
+                 << currentLoop->getHeader()->getName()
+                 << " at non-trivial condition '" << *Val
+                 << "' == " << *LoopCond << "\n"
+                 << ". Cost too high.\n");
+    return false;
+  }

  // Do not do non-trivial unswitch while optimizing for size.
  if (OptimizeForSize || F->hasFnAttribute(Attribute::OptimizeForSize))