Codegen: MachineBlockPlacement Improve probability layout.

The following pattern was being laid out poorly:

      A
     / \
    B   C
   / \ / \
  D   E   ? (Doesn't matter)

Where A->B is far more likely than A->C, and prob(B->D) = prob(B->E).

The current algorithm gives: A,B,C,E (D goes on worklist).
It does this even if C has a frequency count of 0. This patch adjusts the
layout calculation so that if freq(B->E) >> freq(C->E) then we go ahead and
lay out E rather than C. Fallthrough half the time is better than fallthrough
never, or fallthrough very rarely. The resulting layout is: A,B,E (C and D
are in a worklist).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@277187 91177308-0d34-0410-b5e6-96231b3b80d8
2025-04-09 00:51:41 +00:00 · 2016-07-29 18:09:28 +00:00 · 2016-07-29 18:09:28 +00:00 · 9f1f15e084
commit 9f1f15e084
parent 02e59638f8
2 changed files with 213 additions and 15 deletions
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@ -631,18 +631,46 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
  // BB->Succ. This is equivalent to looking the CFG backward with backward
  // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
  // profile data).
-
+  // --------------------------------------------------------------------------
  // Case 3: forked diamond
  //       S
  //      / \
  //     /   \
  //   BB    Pred
  //   | \   / |
  //   |  \ /  |
  //   |   X   |
  //   |  / \  |
  //   | /   \ |
  //   S1     S2
  //
  // The current block is BB and edge BB->S1 is now being evaluated.
  // As above S->BB was already selected because
  // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
  //
  // topo-order:
  //
  //     S-------|                     ---S
  //     |       |                     |  |
  //  ---BB      |                     |  BB
  //  |          |                     |  |
  //  |  Pred----|                     |  S1----
  //  |  |                             |       |
  //  --(S1 or S2)                     ---Pred--
  //
  // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
  //    + min(freq(Pred->S1), freq(Pred->S2))
  // Non-topo-order cost:
  // In the worst case, S2 will not get laid out after Pred.
  // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
  // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
  // is 0. Then the non topo layout is better when
  // freq(S->Pred) < freq(BB->S1).
  // This is exactly what is checked below.
  // Note there are other shapes that apply (Pred may not be a single block),
  // but they all fit this general pattern.
  BranchProbability HotProb = getLayoutSuccessorProbThreshold(BB);
  // Forward checking. For case 2, SuccProb will be 1.
  if (SuccProb < HotProb) {
    DEBUG(dbgs() << "    Not a candidate: " << getBlockName(Succ) << " "
                 << "Respecting topological ordering because "
                 << "probability is less than prob treshold: "
                 << SuccProb << "\n");
    return true;
  }
  // Make sure that a hot successor doesn't have a globally more
  // important predecessor.
  BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
@ -653,11 +681,11 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
        (BlockFilter && !BlockFilter->count(Pred)) ||
        BlockToChain[Pred] == &Chain)
      continue;
-    // Do backward checking. For case 1, it is actually redundant check. For
+    // Do backward checking.
-    // case 2 above, we need a backward checking to filter out edges that are
+    // For all cases above, we need a backward checking to filter out edges that
-    // not 'strongly' biased. With profile data available, the check is mostly
+    // are not 'strongly' biased. With profile data available, the check is
-    // redundant too (when threshold prob is set at 50%) unless S has more than
+    // mostly redundant for case 2 (when threshold prob is set at 50%) unless S
-    // two successors.
+    // has more than two successors.
    // BB  Pred
    //  \ /
    //  Succ
@ -666,6 +694,8 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
    //      i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
    //      HotProb
    //      i.e. freq(BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
    // Case 1 is covered too, because the first equation reduces to:
    // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
    BlockFrequency PredEdgeFreq =
        MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
    if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
--- a/test/CodeGen/X86/block-placement.ll
+++ b/test/CodeGen/X86/block-placement.ll
@ -1283,6 +1283,174 @@ exit:
  ret void
 }
 ; External callees used as markers by the forked-diamond tests in this file:
 ; @a is called from their %fork1 blocks and @b from their %fork2 blocks.
 declare void @a()
 declare void @b()
 define void @test_forked_hot_diamond(i32* %a) {
 ; Test that a hot-branch with probability > 80% followed by a 50/50 branch
 ; will not place the cold predecessor if the probability for the fallthrough
 ; remains above 80%
 ;
 ; CFG shape: %entry branches to %then / %else with weights 84:16 (!5). Both
 ; %then and %else then branch to %fork1 / %fork2 with near-even weights
 ; 5001:4999 (!8), and both forks rejoin at %exit.
 ; The expected order below places %fork1 directly after %then, ahead of the
 ; cold %else block.
 ; CHECK-LABEL: test_forked_hot_diamond
 ; CHECK: %entry
 ; CHECK: %then
 ; CHECK: %fork1
 ; CHECK: %else
 ; CHECK: %fork2
 ; CHECK: %exit
 entry:
  %gep1 = getelementptr i32, i32* %a, i32 1
  %val1 = load i32, i32* %gep1
  %cond1 = icmp ugt i32 %val1, 1
  ; Hot/cold split: 84:16 in favour of %then.
  br i1 %cond1, label %then, label %else, !prof !5
 then:
  call void @hot_function()
  %gep2 = getelementptr i32, i32* %a, i32 2
  %val2 = load i32, i32* %gep2
  %cond2 = icmp ugt i32 %val2, 2
  ; Near-even fork out of the hot side.
  br i1 %cond2, label %fork1, label %fork2, !prof !8
 else:
  call void @cold_function()
  %gep3 = getelementptr i32, i32* %a, i32 3
  %val3 = load i32, i32* %gep3
  %cond3 = icmp ugt i32 %val3, 3
  ; Near-even fork out of the cold side, targeting the same two successors.
  br i1 %cond3, label %fork1, label %fork2, !prof !8
 fork1:
  call void @a()
  br label %exit
 fork2:
  call void @b()
  br label %exit
 exit:
  call void @hot_function()
  ret void
 }
 define void @test_forked_hot_diamond_gets_cold(i32* %a) {
 ; Test that a hot-branch with probability > 80% followed by a 50/50 branch
 ; will place the cold predecessor if the probability for the fallthrough
 ; falls below 80%
 ; The probability for both branches is 85%. For then2 vs else1
 ; this results in a compounded probability of 83%.
 ; Neither then2->fork1 nor then2->fork2 has a large enough relative
 ; probability to break the CFG.
 ; Relative probs:
 ; then2 -> fork1 vs else1 -> fork1 = 71%
 ; then2 -> fork2 vs else2 -> fork2 = 74%
 ;
 ; CFG shape: %entry -> %then1 / %else1 (85:15, !9); %then1 -> %then2 / %else2
 ; (85:15, !9); %then2 -> %fork1 / %fork2 (5001:4999, !8); %else1 falls into
 ; %fork1 and %else2 falls into %fork2; both forks rejoin at %exit.
 ; CHECK-LABEL: test_forked_hot_diamond_gets_cold
 ; CHECK: %entry
 ; CHECK: %then1
 ; CHECK: %then2
 ; CHECK: %else1
 ; CHECK: %fork1
 ; CHECK: %else2
 ; CHECK: %fork2
 ; CHECK: %exit
 entry:
  %gep1 = getelementptr i32, i32* %a, i32 1
  %val1 = load i32, i32* %gep1
  %cond1 = icmp ugt i32 %val1, 1
  br i1 %cond1, label %then1, label %else1, !prof !9
 then1:
  call void @hot_function()
  %gep2 = getelementptr i32, i32* %a, i32 2
  %val2 = load i32, i32* %gep2
  %cond2 = icmp ugt i32 %val2, 2
  br i1 %cond2, label %then2, label %else2, !prof !9
 else1:
  call void @cold_function()
  br label %fork1
 then2:
  call void @hot_function()
  ; Load and test the third element through %gep3/%val3 so this block follows
  ; the same gep/load/icmp pattern as %entry and %then1. The layout under test
  ; is driven by the !prof weights, not by the compared value.
  %gep3 = getelementptr i32, i32* %a, i32 3
  %val3 = load i32, i32* %gep3
  %cond3 = icmp ugt i32 %val3, 3
  br i1 %cond3, label %fork1, label %fork2, !prof !8
 else2:
  call void @cold_function()
  br label %fork2
 fork1:
  call void @a()
  br label %exit
 fork2:
  call void @b()
  br label %exit
 exit:
  call void @hot_function()
  ret void
 }
 define void @test_forked_hot_diamond_stays_hot(i32* %a) {
 ; Test that a hot-branch with probability > 88.88% (1:8) followed by a 50/50
 ; branch will not place the cold predecessor as the probability for the
 ; fallthrough stays above 80%
 ; (1:8) followed by (1:1) is still (1:4)
 ; Here we use 90% probability because two in a row
 ; have a 89 % probability vs the original branch.
 ;
 ; CFG shape: %entry -> %then1 / %else1 (90:10, !10); %then1 -> %then2 / %else2
 ; (90:10, !10); %then2 -> %fork1 / %fork2 (5001:4999, !8); %else1 falls into
 ; %fork1 and %else2 falls into %fork2; both forks rejoin at %exit.
 ; The expected order below places %fork1 directly after %then2, ahead of the
 ; cold %else1 and %else2 blocks.
 ; CHECK-LABEL: test_forked_hot_diamond_stays_hot
 ; CHECK: %entry
 ; CHECK: %then1
 ; CHECK: %then2
 ; CHECK: %fork1
 ; CHECK: %else1
 ; CHECK: %else2
 ; CHECK: %fork2
 ; CHECK: %exit
 entry:
  %gep1 = getelementptr i32, i32* %a, i32 1
  %val1 = load i32, i32* %gep1
  %cond1 = icmp ugt i32 %val1, 1
  br i1 %cond1, label %then1, label %else1, !prof !10
 then1:
  call void @hot_function()
  %gep2 = getelementptr i32, i32* %a, i32 2
  %val2 = load i32, i32* %gep2
  %cond2 = icmp ugt i32 %val2, 2
  br i1 %cond2, label %then2, label %else2, !prof !10
 else1:
  call void @cold_function()
  br label %fork1
 then2:
  call void @hot_function()
  ; Load and test the third element through %gep3/%val3 so this block follows
  ; the same gep/load/icmp pattern as %entry and %then1. The layout under test
  ; is driven by the !prof weights, not by the compared value.
  %gep3 = getelementptr i32, i32* %a, i32 3
  %val3 = load i32, i32* %gep3
  %cond3 = icmp ugt i32 %val3, 3
  br i1 %cond3, label %fork1, label %fork2, !prof !8
 else2:
  call void @cold_function()
  br label %fork2
 fork1:
  call void @a()
  br label %exit
 fork2:
  call void @b()
  br label %exit
 exit:
  call void @hot_function()
  ret void
 }
 ; Branch-weight profiles attached to conditional branches via !prof.
 ; !5: 84:16 split used by the %entry branch of test_forked_hot_diamond.
 !5 = !{!"branch_weights", i32 84, i32 16}
 ; !6 and !7 are not referenced by the forked-diamond tests shown here.
 !6 = !{!"function_entry_count", i32 10}
 !7 = !{!"branch_weights", i32 60, i32 40}
 ; !8: near-even split (5001:4999) used for the %fork1 / %fork2 branches.
 !8 = !{!"branch_weights", i32 5001, i32 4999}
 ; !9: 85:15 split used by test_forked_hot_diamond_gets_cold.
 !9 = !{!"branch_weights", i32 85, i32 15}
 ; !10: 90:10 split used by test_forked_hot_diamond_stays_hot.
 !10 = !{!"branch_weights", i32 90, i32 10}