[Devirtualization] MemDep returns non-local !invariant.group dependencies

Summary:
Memory Dependence Analysis was limited to return only local dependencies
for invariant.group handling. Now it returns NonLocal when it finds it
and then by asking getNonLocalPointerDependency we get found dep.

Thanks to this we are able to devirtualize loops!

    void indirect(A &a, int n) {
      for (int i = 0 ; i < n; i++)
        a.foo();

    }
    void test(int n) {
      A a;
      indirect(a);
    }

After inlining a.foo() will be changed to direct call, even if foo and A::A()
is external (but only if vtable definition is be available).

Reviewers: nlewycky, dberlin, chandlerc, rsmith

Subscribers: mehdi_amini, davide, llvm-commits

Differential Revision: https://reviews.llvm.org/D28137

llvm-svn: 291762
This commit is contained in:
Piotr Padlewski 2017-01-12 11:33:58 +00:00
parent 592cce271e
commit 9032a3fc4f
6 changed files with 145 additions and 21 deletions

View File

@ -302,6 +302,10 @@ private:
NonLocalPointerInfo() : Size(MemoryLocation::UnknownSize) {}
};
/// Cache storing single nonlocal def for the instruction.
/// It is set when nonlocal def would be found in function returning only
/// local dependencies.
DenseMap<Instruction *, NonLocalDepResult> NonLocalDefsCache;
/// This map stores the cached results of doing a pointer lookup at the
/// bottom of a block.
///
@ -441,9 +445,9 @@ public:
/// This analysis looks for other loads and stores with invariant.group
/// metadata and the same pointer operand. Returns Unknown if it does not
/// find anything, and Def if it can be assumed that 2 instructions load or
/// store the same value.
/// FIXME: This analysis works only on single block because of restrictions
/// at the call site.
/// store the same value and NonLocal which indicate that non-local Def was
/// found, which can be retrieved by calling getNonLocalPointerDependency
/// with the same queried instruction.
MemDepResult getInvariantGroupPointerDependency(LoadInst *LI, BasicBlock *BB);
/// Looks at a memory location for a load (specified by MemLocBase, Offs, and

View File

@ -323,17 +323,28 @@ MemDepResult MemoryDependenceResults::getPointerDependencyFrom(
const MemoryLocation &MemLoc, bool isLoad, BasicBlock::iterator ScanIt,
BasicBlock *BB, Instruction *QueryInst, unsigned *Limit) {
MemDepResult InvariantGroupDependency = MemDepResult::getUnknown();
if (QueryInst != nullptr) {
if (auto *LI = dyn_cast<LoadInst>(QueryInst)) {
MemDepResult InvariantGroupDependency =
getInvariantGroupPointerDependency(LI, BB);
InvariantGroupDependency = getInvariantGroupPointerDependency(LI, BB);
if (InvariantGroupDependency.isDef())
return InvariantGroupDependency;
}
}
return getSimplePointerDependencyFrom(MemLoc, isLoad, ScanIt, BB, QueryInst,
Limit);
MemDepResult SimpleDep = getSimplePointerDependencyFrom(
MemLoc, isLoad, ScanIt, BB, QueryInst, Limit);
if (SimpleDep.isDef())
return SimpleDep;
// Non-local invariant group dependency indicates there is non local Def
// (it only returns nonLocal if it finds nonLocal def), which is better than
// local clobber and everything else.
if (InvariantGroupDependency.isNonLocal())
return InvariantGroupDependency;
assert(InvariantGroupDependency.isUnknown() &&
"InvariantGroupDependency should be only unknown at this point");
return SimpleDep;
}
MemDepResult
@ -358,6 +369,20 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI,
// Queue to process all pointers that are equivalent to load operand.
SmallVector<const Value *, 8> LoadOperandsQueue;
LoadOperandsQueue.push_back(LoadOperand);
Instruction *ClosestDependency = nullptr;
// Order of instructions in uses list is unpredictible. In order to always
// get the same result, we will look for the closest dominance.
auto GetClosestDependency = [this](Instruction *Best, Instruction *Other) {
assert(Other && "Must call it with not null instruction");
if (Best == nullptr || DT.dominates(Best, Other))
return Other;
return Best;
};
// FIXME: This loop is O(N^2) because dominates can be O(n) and in worst case
// we will see all the instructions. This should be fixed in MSSA.
while (!LoadOperandsQueue.empty()) {
const Value *Ptr = LoadOperandsQueue.pop_back_val();
assert(Ptr && !isa<GlobalValue>(Ptr) &&
@ -388,12 +413,24 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI,
// If we hit load/store with the same invariant.group metadata (and the
// same pointer operand) we can assume that value pointed by pointer
// operand didn't change.
if ((isa<LoadInst>(U) || isa<StoreInst>(U)) && U->getParent() == BB &&
if ((isa<LoadInst>(U) || isa<StoreInst>(U)) &&
U->getMetadata(LLVMContext::MD_invariant_group) == InvariantGroupMD)
return MemDepResult::getDef(U);
ClosestDependency = GetClosestDependency(ClosestDependency, U);
}
}
return MemDepResult::getUnknown();
if (!ClosestDependency)
return MemDepResult::getUnknown();
if (ClosestDependency->getParent() == BB)
return MemDepResult::getDef(ClosestDependency);
// Def(U) can't be returned here because it is non-local. If local
// dependency won't be found then return nonLocal counting that the
// user will call getNonLocalPointerDependency, which will return cached
// result.
NonLocalDefsCache.try_emplace(
LI, NonLocalDepResult(ClosestDependency->getParent(),
MemDepResult::getDef(ClosestDependency), nullptr));
return MemDepResult::getNonLocal();
}
MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom(
@ -877,7 +914,17 @@ void MemoryDependenceResults::getNonLocalPointerDependency(
assert(Loc.Ptr->getType()->isPointerTy() &&
"Can't get pointer deps of a non-pointer!");
Result.clear();
{
// Check if there is cached Def with invariant.group. FIXME: cache might be
// invalid if cached instruction would be removed between call to
// getPointerDependencyFrom and this function.
auto NonLocalDefIt = NonLocalDefsCache.find(QueryInst);
if (NonLocalDefIt != NonLocalDefsCache.end()) {
Result.push_back(std::move(NonLocalDefIt->second));
NonLocalDefsCache.erase(NonLocalDefIt);
return;
}
}
// This routine does not expect to deal with volatile instructions.
// Doing so would require piping through the QueryInst all the way through.
// TODO: volatiles can't be elided, but they can be reordered with other

View File

@ -65,22 +65,20 @@ if.then: ; preds = %entry
%vtable1 = load i8**, i8*** %1, align 8, !invariant.group !0
%vtable2.cast = bitcast i8** %vtable1 to i32 (%struct.A*)**
%call1 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable2.cast, align 8
; FIXME: those loads could be also direct, but right now the invariant.group
; analysis works only on single block
; CHECK-NOT: call i32 @_ZN1A3fooEv(
; CHECK: call i32 @_ZN1A3fooEv(
%callx = tail call i32 %call1(%struct.A* %0) #1
%vtable2 = load i8**, i8*** %1, align 8, !invariant.group !0
%vtable3.cast = bitcast i8** %vtable2 to i32 (%struct.A*)**
%call4 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable3.cast, align 8
; CHECK-NOT: call i32 @_ZN1A3fooEv(
; CHECK: call i32 @_ZN1A3fooEv(
%cally = tail call i32 %call4(%struct.A* %0) #1
%b = bitcast i8* %call to %struct.A**
%vtable3 = load %struct.A*, %struct.A** %b, align 8, !invariant.group !0
%vtable4.cast = bitcast %struct.A* %vtable3 to i32 (%struct.A*)**
%vfun = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable4.cast, align 8
; CHECK-NOT: call i32 @_ZN1A3fooEv(
; CHECK: call i32 @_ZN1A3fooEv(
%unknown = tail call i32 %vfun(%struct.A* %0) #1
br label %if.end

View File

@ -392,6 +392,44 @@ define void @testNotGlobal() {
ret void
}
; CHECK-LABEL: define void @handling_loops()
define void @handling_loops() {
%a = alloca %struct.A, align 8
%1 = bitcast %struct.A* %a to i8*
%2 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 0
store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i64 0, i64 2) to i32 (...)**), i32 (...)*** %2, align 8, !invariant.group !0
%3 = load i8, i8* @unknownPtr, align 4
%4 = icmp sgt i8 %3, 0
br i1 %4, label %.lr.ph.i, label %_Z2g2R1A.exit
.lr.ph.i: ; preds = %0
%5 = bitcast %struct.A* %a to void (%struct.A*)***
%6 = load i8, i8* @unknownPtr, align 4
%7 = icmp sgt i8 %6, 1
br i1 %7, label %._crit_edge.preheader, label %_Z2g2R1A.exit
._crit_edge.preheader: ; preds = %.lr.ph.i
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.preheader, %._crit_edge
%8 = phi i8 [ %10, %._crit_edge ], [ 1, %._crit_edge.preheader ]
%.pre = load void (%struct.A*)**, void (%struct.A*)*** %5, align 8, !invariant.group !0
%9 = load void (%struct.A*)*, void (%struct.A*)** %.pre, align 8
; CHECK: call void @_ZN1A3fooEv(%struct.A* nonnull %a)
call void %9(%struct.A* nonnull %a) #3
; CHECK-NOT: call void %
%10 = add nuw nsw i8 %8, 1
%11 = load i8, i8* @unknownPtr, align 4
%12 = icmp slt i8 %10, %11
br i1 %12, label %._crit_edge, label %_Z2g2R1A.exit.loopexit
_Z2g2R1A.exit.loopexit: ; preds = %._crit_edge
br label %_Z2g2R1A.exit
_Z2g2R1A.exit: ; preds = %_Z2g2R1A.exit.loopexit, %.lr.ph.i, %0
ret void
}
declare void @foo(i8*)
declare void @foo2(i8*, i8)

View File

@ -66,22 +66,20 @@ if.then: ; preds = %entry
%vtable1 = load i8**, i8*** %1, align 8, !invariant.group !0
%vtable2.cast = bitcast i8** %vtable1 to i32 (%struct.A*)**
%call1 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable2.cast, align 8
; FIXME: those loads could be also direct, but right now the invariant.group
; analysis works only on single block
; CHECK-NOT: call i32 @_ZN1A3fooEv(
; CHECK: call i32 @_ZN1A3fooEv(
%callx = tail call i32 %call1(%struct.A* %0) #1
%vtable2 = load i8**, i8*** %1, align 8, !invariant.group !0
%vtable3.cast = bitcast i8** %vtable2 to i32 (%struct.A*)**
%call4 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable3.cast, align 8
; CHECK-NOT: call i32 @_ZN1A3fooEv(
; CHECK: call i32 @_ZN1A3fooEv(
%cally = tail call i32 %call4(%struct.A* %0) #1
%b = bitcast i8* %call to %struct.A**
%vtable3 = load %struct.A*, %struct.A** %b, align 8, !invariant.group !0
%vtable4.cast = bitcast %struct.A* %vtable3 to i32 (%struct.A*)**
%vfun = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable4.cast, align 8
; CHECK-NOT: call i32 @_ZN1A3fooEv(
; CHECK: call i32 @_ZN1A3fooEv(
%unknown = tail call i32 %vfun(%struct.A* %0) #1
br label %if.end

View File

@ -393,6 +393,45 @@ define void @testNotGlobal() {
ret void
}
; CHECK-LABEL: define void @handling_loops()
define void @handling_loops() {
%a = alloca %struct.A, align 8
%1 = bitcast %struct.A* %a to i8*
%2 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 0
store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i64 0, i64 2) to i32 (...)**), i32 (...)*** %2, align 8, !invariant.group !0
%3 = load i8, i8* @unknownPtr, align 4
%4 = icmp sgt i8 %3, 0
br i1 %4, label %.lr.ph.i, label %_Z2g2R1A.exit
.lr.ph.i: ; preds = %0
%5 = bitcast %struct.A* %a to void (%struct.A*)***
%6 = load i8, i8* @unknownPtr, align 4
%7 = icmp sgt i8 %6, 1
br i1 %7, label %._crit_edge.preheader, label %_Z2g2R1A.exit
._crit_edge.preheader: ; preds = %.lr.ph.i
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.preheader, %._crit_edge
%8 = phi i8 [ %10, %._crit_edge ], [ 1, %._crit_edge.preheader ]
%.pre = load void (%struct.A*)**, void (%struct.A*)*** %5, align 8, !invariant.group !0
%9 = load void (%struct.A*)*, void (%struct.A*)** %.pre, align 8
; CHECK: call void @_ZN1A3fooEv(%struct.A* nonnull %a)
call void %9(%struct.A* nonnull %a) #3
; CHECK-NOT: call void %
%10 = add nuw nsw i8 %8, 1
%11 = load i8, i8* @unknownPtr, align 4
%12 = icmp slt i8 %10, %11
br i1 %12, label %._crit_edge, label %_Z2g2R1A.exit.loopexit
_Z2g2R1A.exit.loopexit: ; preds = %._crit_edge
br label %_Z2g2R1A.exit
_Z2g2R1A.exit: ; preds = %_Z2g2R1A.exit.loopexit, %.lr.ph.i, %0
ret void
}
declare void @foo(i8*)
declare void @foo2(i8*, i8)