[LoopSink] Allow sinking to PHI-use (2nd attempt)

This change allows defs in the loop preheader that have PHI-uses to be sunk into the loop body. Loop sink can now look through a PHI-use and select the incoming block of the value being used as a candidate sink destination.

It makes loop sink more effective, so more LICM can be undone when it is proven unprofitable by profile info. It addresses the motivating case in D87551 without resorting to profile-guided LICM, which breaks canonicalization.

This is the 2nd attempt after D152772.
This commit is contained in:
Wenlei He 2023-06-23 09:30:06 -07:00
parent 8015ea6a6d
commit 9a868a902c
3 changed files with 205 additions and 6 deletions

View File

@ -177,13 +177,27 @@ static bool sinkInstruction(
SmallPtrSet<BasicBlock *, 2> BBs;
for (auto &U : I.uses()) {
Instruction *UI = cast<Instruction>(U.getUser());
// We cannot sink I to PHI-uses.
if (isa<PHINode>(UI))
return false;
// We cannot sink I if it has uses outside of the loop.
if (!L.contains(LI.getLoopFor(UI->getParent())))
return false;
BBs.insert(UI->getParent());
if (!isa<PHINode>(UI)) {
BBs.insert(UI->getParent());
continue;
}
// We cannot sink I to PHI-uses, try to look through PHI to find the incoming
// block of the value being used.
PHINode *PN = dyn_cast<PHINode>(UI);
BasicBlock *PhiBB = PN->getIncomingBlock(U);
// If value's incoming block is from loop preheader directly, there's no
// place to sink to, bailout.
if (L.getLoopPreheader() == PhiBB)
return false;
BBs.insert(PhiBB);
}
// findBBsToSinkInto is O(BBs.size() * ColdLoopBBs.size()). We cap the max
@ -238,9 +252,11 @@ static bool sinkInstruction(
}
}
// Replaces uses of I with IC in N
// Replaces uses of I with IC in N, except PHI-use which is being taken
// care of by defs in PHI's incoming blocks.
I.replaceUsesWithIf(IC, [N](Use &U) {
return cast<Instruction>(U.getUser())->getParent() == N;
Instruction *UIToReplace = cast<Instruction>(U.getUser());
return UIToReplace->getParent() == N && !isa<PHINode>(UIToReplace);
});
// Replaces uses of I with IC in blocks dominated by N
replaceDominatedUsesWith(&I, IC, DT, N);

View File

@ -0,0 +1,115 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt -S -verify-memoryssa -passes=loop-sink < %s | FileCheck %s
; Make sure that unprofitable LICM can be undone by loop sink, and that loop sink can handle
; sinking through a PHI-use.
define dso_local i32 @_Z3fooii(i32 %arg, i32 %arg1, i32 %arg2) local_unnamed_addr #0 !prof !29 {
; CHECK-LABEL: define dso_local i32 @_Z3fooii
; CHECK-SAME: (i32 [[ARG:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !prof [[PROF29:![0-9]+]] {
; CHECK-NEXT: .l.check.preheader:
; CHECK-NEXT: br label [[DOTL_CHECK:%.*]]
; CHECK: .l.ret.loopexit:
; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[I10:%.*]], [[DOTL_ITERATE:%.*]] ]
; CHECK-NEXT: ret i32 [[DOTLCSSA]]
; CHECK: .l.check:
; CHECK-NEXT: [[I4:%.*]] = phi i32 [ 0, [[DOTL_CHECK_PREHEADER:%.*]] ], [ [[I11:%.*]], [[DOTL_ITERATE]] ]
; CHECK-NEXT: [[I5:%.*]] = phi i32 [ [[ARG]], [[DOTL_CHECK_PREHEADER]] ], [ [[I10]], [[DOTL_ITERATE]] ]
; CHECK-NEXT: [[I6:%.*]] = icmp eq i32 [[I4]], [[ARG1]]
; CHECK-NEXT: br i1 [[I6]], label [[DOTL_COLD:%.*]], label [[DOTL_ITERATE]], !prof [[PROF30:![0-9]+]]
; CHECK: .l.cold:
; CHECK-NEXT: [[FLAG:%.*]] = icmp eq i32 [[ARG1]], 5
; CHECK-NEXT: br i1 [[FLAG]], label [[DOTL_COLD1:%.*]], label [[DOTL_COLD2:%.*]]
; CHECK: .l.cold1:
; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i32 [[ARG2]], [[ARG2]]
; CHECK-NEXT: br label [[DOTL_COLD3:%.*]]
; CHECK: .l.cold2:
; CHECK-NEXT: [[TMP2:%.*]] = add nsw i32 [[ARG2]], [[ARG2]]
; CHECK-NEXT: br label [[DOTL_COLD3]]
; CHECK: .l.cold3:
; CHECK-NEXT: [[I7:%.*]] = phi i32 [ [[TMP1]], [[DOTL_COLD1]] ], [ [[TMP2]], [[DOTL_COLD2]] ]
; CHECK-NEXT: [[I8:%.*]] = tail call i32 @_Z3bari(i32 [[I5]])
; CHECK-NEXT: [[I9:%.*]] = add nsw i32 [[I8]], [[I7]]
; CHECK-NEXT: br label [[DOTL_ITERATE]]
; CHECK: .l.iterate:
; CHECK-NEXT: [[I10]] = phi i32 [ [[I9]], [[DOTL_COLD3]] ], [ [[I5]], [[DOTL_CHECK]] ]
; CHECK-NEXT: [[I11]] = add nuw nsw i32 [[I4]], 1
; CHECK-NEXT: [[I12:%.*]] = icmp eq i32 [[I11]], [[I10]]
; CHECK-NEXT: br i1 [[I12]], label [[DOTL_RET_LOOPEXIT:%.*]], label [[DOTL_CHECK]]
;
; Input IR starts here. %flag, %tmp2 and %tmp1 are all defined in the preheader.
; The assertions above expect the preheader to contain only the branch after the
; pass runs, with %flag moved to .l.cold, %tmp1 (mul) moved to .l.cold1 and
; %tmp2 (add) moved to .l.cold2.
.l.check.preheader:
%flag = icmp eq i32 %arg1, 5
%tmp2 = add nsw i32 %arg2, %arg2
%tmp1 = mul nsw i32 %arg2, %arg2
br label %.l.check
.l.ret.loopexit: ; preds = %.l.iterate
%.lcssa = phi i32 [ %i10, %.l.iterate ]
ret i32 %.lcssa
.l.check: ; preds = %.l.iterate, %.l.check.preheader
%i4 = phi i32 [ 0, %.l.check.preheader ], [ %i11, %.l.iterate ]
%i5 = phi i32 [ %arg, %.l.check.preheader ], [ %i10, %.l.iterate ]
%i6 = icmp eq i32 %i4, %arg1
; !30 weights this branch 1 (to .l.cold) against 201 (to .l.iterate), so the
; .l.cold* blocks are the rarely executed part of the loop.
br i1 %i6, label %.l.cold, label %.l.iterate, !prof !30
.l.cold: ; preds = %.l.check
; The only use of %flag in this function: an ordinary (non-PHI) use.
br i1 %flag, label %.l.cold1, label %.l.cold2
.l.cold1: ; preds = %.l.cold
br label %.l.cold3
.l.cold2: ; preds = %.l.cold
br label %.l.cold3
.l.cold3: ; preds = %.l.cold2, %.l.cold1
; %tmp1 and %tmp2 are used only by this PHI; their incoming blocks (.l.cold1 and
; .l.cold2) are where the assertions above expect the definitions to end up.
%i7 = phi i32 [ %tmp1, %.l.cold1 ], [ %tmp2, %.l.cold2 ]
%i8 = tail call i32 @_Z3bari(i32 %i5)
%i9 = add nsw i32 %i8, %i7
br label %.l.iterate
.l.iterate: ; preds = %.l.cold3, %.l.check
%i10 = phi i32 [ %i9, %.l.cold3 ], [ %i5, %.l.check ]
%i11 = add nuw nsw i32 %i4, 1
%i12 = icmp eq i32 %i11, %i10
br i1 %i12, label %.l.ret.loopexit, label %.l.check
}
declare dso_local i32 @_Z3bari(i32) local_unnamed_addr
attributes #0 = { "use-sample-profile" }
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9, !10, !11}
!2 = !{!"ProfileFormat", !"SampleProfile"}
!3 = !{!"TotalCount", i64 403}
!4 = !{!"MaxCount", i64 200}
!5 = !{!"MaxInternalCount", i64 0}
!6 = !{!"MaxFunctionCount", i64 1}
!7 = !{!"NumCounts", i64 6}
!8 = !{!"NumFunctions", i64 1}
!9 = !{!"IsPartialProfile", i64 0}
!10 = !{!"PartialProfileRatio", double 0.000000e+00}
!11 = !{!"DetailedSummary", !12}
!12 = !{!13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28}
!13 = !{i32 10000, i64 200, i32 2}
!14 = !{i32 100000, i64 200, i32 2}
!15 = !{i32 200000, i64 200, i32 2}
!16 = !{i32 300000, i64 200, i32 2}
!17 = !{i32 400000, i64 200, i32 2}
!18 = !{i32 500000, i64 200, i32 2}
!19 = !{i32 600000, i64 200, i32 2}
!20 = !{i32 700000, i64 200, i32 2}
!21 = !{i32 800000, i64 200, i32 2}
!22 = !{i32 900000, i64 200, i32 2}
!23 = !{i32 950000, i64 200, i32 2}
!24 = !{i32 990000, i64 200, i32 2}
!25 = !{i32 999000, i64 1, i32 5}
!26 = !{i32 999900, i64 1, i32 5}
!27 = !{i32 999990, i64 1, i32 5}
!28 = !{i32 999999, i64 1, i32 5}
!29 = !{!"function_entry_count", i64 2}
!30 = !{!"branch_weights", i32 1, i32 201}

View File

@ -0,0 +1,68 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt -S -verify-memoryssa -passes=loop-sink < %s | FileCheck %s
; Make sure that we handle PHI-uses correctly during loop sink if the most profitable sink
; destination also contains a PHI that is another use of the value being sunk.
%struct.blam = type { %struct.blam.0, [32 x i8] }
%struct.blam.0 = type { ptr, i64 }
define internal void @wibble() !prof !0 {
; CHECK-LABEL: define internal void @wibble
; CHECK-SAME: () !prof [[PROF0:![0-9]+]] {
; CHECK-NEXT: bb:
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
; CHECK-NEXT: br i1 false, label [[BB2:%.*]], label [[BB3:%.*]]
; CHECK: bb2:
; CHECK-NEXT: ret void
; CHECK: bb3:
; CHECK-NEXT: switch i32 0, label [[BB5:%.*]] [
; CHECK-NEXT: i32 1, label [[BB4:%.*]]
; CHECK-NEXT: i32 0, label [[BB1]]
; CHECK-NEXT: ], !prof [[PROF1:![0-9]+]]
; CHECK: bb4:
; CHECK-NEXT: br i1 false, label [[BB3]], label [[BB6:%.*]]
; CHECK: bb5:
; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr [[STRUCT_BLAM:%.*]], ptr null, i64 0, i32 1
; CHECK-NEXT: br i1 false, label [[BB6]], label [[BB7:%.*]]
; CHECK: bb6:
; CHECK-NEXT: br label [[BB7]]
; CHECK: bb7:
; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ null, [[BB6]] ], [ [[GETELEMENTPTR]], [[BB5]] ]
; CHECK-NEXT: [[GETELEMENTPTR1:%.*]] = getelementptr [[STRUCT_BLAM]], ptr null, i64 0, i32 1
; CHECK-NEXT: store ptr [[GETELEMENTPTR1]], ptr null, align 8
; CHECK-NEXT: br label [[BB1]]
;
; Input IR starts here. %getelementptr is defined in bb, ahead of the branch to
; bb1, and has two uses, both in bb7: the PHI (incoming from bb5) and the store.
; The assertions above expect bb to contain only the branch after the pass runs,
; with one copy of the GEP in bb5 feeding the PHI and a separate copy in bb7
; feeding the store.
bb:
%getelementptr = getelementptr %struct.blam, ptr null, i64 0, i32 1
br label %bb1
bb1: ; preds = %bb7, %bb3, %bb
br i1 false, label %bb2, label %bb3
bb2: ; preds = %bb1
ret void
bb3: ; preds = %bb4, %bb1
; !1 carries the weights 1, 188894 and 287400 for this switch; per the LangRef
; ordering (default destination first) the weight of 1 belongs to bb5.
switch i32 0, label %bb5 [
i32 1, label %bb4
i32 0, label %bb1
], !prof !1
bb4: ; preds = %bb3
br i1 false, label %bb3, label %bb6
bb5: ; preds = %bb3
br i1 false, label %bb6, label %bb7
bb6: ; preds = %bb5, %bb4
br label %bb7
bb7: ; preds = %bb6, %bb5
; PHI-use of %getelementptr: the value arrives along the bb5 -> bb7 edge.
%phi = phi ptr [ null, %bb6 ], [ %getelementptr, %bb5 ]
; Ordinary (non-PHI) use of %getelementptr in the same block as the PHI above.
store ptr %getelementptr, ptr null, align 8
br label %bb1
}
!0 = !{!"function_entry_count", i64 1}
!1 = !{!"branch_weights", i32 1, i32 188894, i32 287400}