[GlobalOpt] Demote globals to locals more aggressively

Global-to-local demotion can speed up programs that make heavy use of globals. It is particularly useful with LTO, where the entire call graph is known and most functions have been internalized.

For a global to be demoted, it must be accessed by only one function, and that function (see the sketch after this list):
  1. Must never recurse, directly or indirectly, otherwise the GV would be clobbered.
  2. Must never rely on the value of the GV at function entry (apart from the initializer).
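
For illustration, a minimal C++ sketch of a qualifying pattern (hypothetical code, not taken from this patch; under LTO the accessing function would typically also have been internalized):

  // Hypothetical example: Counter is accessed only by run(), which never
  // recurses and always stores to the global before reading it, so its value
  // at function entry is dead and the global can become a local alloca.
  static int Counter;

  int run(int N) {
    Counter = 0;                // this store dominates every access below
    for (int I = 0; I < N; ++I)
      Counter += I;
    return Counter;             // after demotion these touch a stack slot
  }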

GlobalOpt can already do this, but it is hamstrung: it only ever tries to demote globals inside "main", because C++ gives extra guarantees about how main is called, namely once and only once.

In LTO mode, we can often prove the first property: if the function is internal by this point, we know enough about the call graph to determine whether it could possibly recurse. FunctionAttrs now infers the "norecurse" attribute for this reason.

The second property can be proven for a subset of functions by showing that every load from the GV is dominated by a store to the GV. This is deliberately conservative to keep compile time down: it only requires a DominatorTree, which is fairly cheap in the grand scheme of things. We could use MemoryDependenceAnalysis to catch more cases, but this approach appears to catch most of the useful ones in my testing.
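
As a rough illustration of what this dominance check accepts and rejects (hypothetical C++, not part of the patch, along the same lines as the new tests below):

  // Hypothetical example: each global is accessed by exactly one function.
  static int G1, G2;

  int demotable() {
    G1 = 1;                     // every load from G1 is dominated by this store
    return G1;                  // G1's value on entry is never observed
  }

  int keepsItsGlobal(bool C) {
    if (C)
      G2 = 1;                   // this store does not dominate the load below
    return G2;                  // G2 may be live on entry, so we bail out
  }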

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@253168 91177308-0d34-0410-b5e6-96231b3b80d8
James Molloy 2015-11-15 14:21:37 +00:00
parent fd671c136f
commit fb7bc35602
4 changed files with 158 additions and 9 deletions


@@ -28,6 +28,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -69,6 +70,7 @@ namespace {
struct GlobalOpt : public ModulePass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
}
static char ID; // Pass identification, replacement for typeid
GlobalOpt() : ModulePass(ID) {
@@ -85,7 +87,9 @@ namespace {
bool ProcessInternalGlobal(GlobalVariable *GV,Module::global_iterator &GVI,
const GlobalStatus &GS);
bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn);
bool isPointerValueDeadOnEntryToFunction(const Function *F, GlobalValue *GV);
TargetLibraryInfo *TLI;
SmallSet<const Comdat *, 8> NotDiscardableComdats;
};
@@ -95,6 +99,7 @@ char GlobalOpt::ID = 0;
INITIALIZE_PASS_BEGIN(GlobalOpt, "globalopt",
"Global Variable Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(GlobalOpt, "globalopt",
"Global Variable Optimizer", false, false)
@@ -1718,15 +1723,78 @@ bool GlobalOpt::ProcessGlobal(GlobalVariable *GV,
return ProcessInternalGlobal(GV, GVI, GS);
}
bool GlobalOpt::isPointerValueDeadOnEntryToFunction(const Function *F, GlobalValue *GV) {
// Find all uses of GV. We expect them all to be in F, and if we can't
// identify any of the uses we bail out.
//
// On each of these uses, identify if the memory that GV points to is
// used/required/live at the start of the function. If it is not, for example
// if the first thing the function does is store to the GV, the GV can
// possibly be demoted.
//
// We don't do an exhaustive search for memory operations - simply look
// through bitcasts as they're quite common and benign.
const DataLayout &DL = GV->getParent()->getDataLayout();
SmallVector<LoadInst *, 4> Loads;
SmallVector<StoreInst *, 4> Stores;
for (auto *U : GV->users()) {
if (Operator::getOpcode(U) == Instruction::BitCast) {
for (auto *UU : U->users()) {
if (auto *LI = dyn_cast<LoadInst>(UU))
Loads.push_back(LI);
else if (auto *SI = dyn_cast<StoreInst>(UU))
Stores.push_back(SI);
else
return false;
}
continue;
}
Instruction *I = dyn_cast<Instruction>(U);
if (!I)
return false;
assert(I->getParent()->getParent() == F);
if (auto *LI = dyn_cast<LoadInst>(I))
Loads.push_back(LI);
else if (auto *SI = dyn_cast<StoreInst>(I))
Stores.push_back(SI);
else
return false;
}
// We have identified all uses of GV into loads and stores. Now check if all
// of them are known not to depend on the value of the global at the function
// entry point. We do this by ensuring that every load is dominated by at
// least one store.
auto &DT = getAnalysis<DominatorTreeWrapperPass>(*const_cast<Function *>(F))
.getDomTree();
for (auto *L : Loads) {
auto *LTy = L->getType();
if (!std::any_of(Stores.begin(), Stores.end(), [&](StoreInst *S) {
auto *STy = S->getValueOperand()->getType();
// The load is only dominated by the store if DomTree says so
// and the number of bits loaded in L is less than or equal to
// the number of bits stored in S.
return DT.dominates(S, L) &&
DL.getTypeStoreSize(LTy) <= DL.getTypeStoreSize(STy);
}))
return false;
}
// All loads have known dependences inside F, so the global can be localized.
return true;
}
/// Analyze the specified global variable and optimize
/// it if possible. If we make a change, return true.
bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
Module::global_iterator &GVI,
const GlobalStatus &GS) {
auto &DL = GV->getParent()->getDataLayout();
// If this is a first class global and has only one accessing function
// and this function is main (which we know is not recursive), we replace
// the global with a local alloca in this function.
// If this is a first class global and has only one accessing function and
// this function is non-recursive, we replace the global with a local alloca
// in this function.
//
// NOTE: It doesn't make sense to promote non-single-value types since we
// are just replacing static memory to stack memory.
@@ -1735,9 +1803,10 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
if (!GS.HasMultipleAccessingFunctions &&
GS.AccessingFunction && !GS.HasNonInstructionUser &&
GV->getType()->getElementType()->isSingleValueType() &&
GS.AccessingFunction->getName() == "main" &&
GS.AccessingFunction->hasExternalLinkage() &&
GV->getType()->getAddressSpace() == 0) {
GV->getType()->getAddressSpace() == 0 &&
!GV->isExternallyInitialized() &&
GS.AccessingFunction->doesNotRecurse() &&
isPointerValueDeadOnEntryToFunction(GS.AccessingFunction, GV) ) {
DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n");
Instruction &FirstI = const_cast<Instruction&>(*GS.AccessingFunction
->getEntryBlock().begin());


@@ -0,0 +1,80 @@
; RUN: opt -globalopt -S < %s | FileCheck %s
@G1 = internal global i32 5
@G2 = internal global i32 5
@G3 = internal global i32 5
@G4 = internal global i32 5
@G5 = internal global i32 5
; CHECK-LABEL: @test1
define internal i32 @test1() norecurse {
; CHECK-NOT: @G1
store i32 4, i32* @G1
%a = load i32, i32* @G1
; CHECK: ret
ret i32 %a
}
; The load comes before the store which makes @G2 live before the call.
; CHECK-LABEL: @test2
define internal i32 @test2() norecurse {
; CHECK-NOT: %G2
%a = load i32, i32* @G2
store i32 4, i32* @G2
; CHECK: ret
ret i32 %a
}
; This global is indexed by a GEP - this makes it partial alias and we bail out.
; FIXME: We don't actually have to bail out in this case.
; CHECK-LABEL: @test3
define internal i32 @test3() norecurse {
; CHECK-NOT: %G3
%x = getelementptr i32,i32* @G3, i32 0
%a = load i32, i32* %x
store i32 4, i32* @G3
; CHECK: ret
ret i32 %a
}
; The global is casted away to a larger type then loaded. The store only partially
; covers the load, so we must not demote.
; CHECK-LABEL: @test4
define internal i32 @test4() norecurse {
; CHECK-NOT: %G4
store i32 4, i32* @G4
%x = bitcast i32* @G4 to i64*
%a = load i64, i64* %x
%b = trunc i64 %a to i32
; CHECK: ret
ret i32 %b
}
; The global is casted away to a smaller type then loaded. This one is fine.
; CHECK-LABEL: @test5
define internal i32 @test5() norecurse {
; CHECK-NOT: @G5
store i32 4, i32* @G5
%x = bitcast i32* @G5 to i16*
%a = load i16, i16* %x
%b = zext i16 %a to i32
; CHECK: ret
ret i32 %b
}
define i32 @main() norecurse {
%a = call i32 @test1()
%b = call i32 @test2()
%c = call i32 @test3()
%d = call i32 @test4()
%e = call i32 @test5()
%x = or i32 %a, %b
%y = or i32 %x, %c
%z = or i32 %y, %d
%w = or i32 %z, %e
ret i32 %w
}


@@ -5,7 +5,7 @@
; to that containing %G should likewise drop to null.
@G = internal global i8** null
define i32 @main(i32 %argc, i8** %argv) {
define i32 @main(i32 %argc, i8** %argv) norecurse {
; CHECK-LABEL: @main(
; CHECK: %G = alloca
store i8** %argv, i8*** @G


@@ -11,7 +11,7 @@
@G = internal global i8** null
@G2 = internal global i8** null
define i32 @main(i32 %argc, i8** %argv) {
define i32 @main(i32 %argc, i8** %argv) norecurse {
; CHECK: alloca
store i8** %argv, i8*** @G
ret i32 0