[dsymutil] Reduce peak memory usage for the single threaded execution.

Keeping the compile units in memory is expensive. For the single
threaded case we allocate them in the analyze part and deallocate them
again once we've finished cloning. This poses a problem in the single
threaded case where we did all the analysis first followed by all the
cloning. This meant we had all the link context in memory right after
analyzing finished.

This patch changes the way we order work in the single threaded case.
Instead of doing all the analysis and cloning in serial, we now
interleave the two so we can deallocate the memory as soon as a file is
processed. The result is binary identical and peak memory usage went
down from 13.43GB to 5.73GB for a debug build of trunk clang.

Differential revision: https://reviews.llvm.org/D51618

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@341568 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Jonas Devlieghere 2018-09-06 17:31:59 +00:00
parent bd42453404
commit 9840d7c8db

View File

@ -2483,50 +2483,41 @@ bool DwarfLinker::link(const DebugMap &Map) {
std::condition_variable ProcessedFilesConditionVariable;
BitVector ProcessedFiles(NumObjects, false);
// Now do analyzeContextInfo in parallel as it is particularly expensive.
auto AnalyzeLambda = [&]() {
for (unsigned i = 0, e = NumObjects; i != e; ++i) {
auto &LinkContext = ObjectContexts[i];
// Analyzing the context info is particularly expensive so it is executed in
// parallel with emitting the previous compile unit.
auto AnalyzeLambda = [&](size_t i) {
auto &LinkContext = ObjectContexts[i];
if (!LinkContext.ObjectFile || !LinkContext.DwarfContext) {
std::unique_lock<std::mutex> LockGuard(ProcessedFilesMutex);
ProcessedFiles.set(i);
ProcessedFilesConditionVariable.notify_one();
if (!LinkContext.ObjectFile || !LinkContext.DwarfContext)
return;
for (const auto &CU : LinkContext.DwarfContext->compile_units()) {
updateDwarfVersion(CU->getVersion());
// The !registerModuleReference() condition effectively skips
// over fully resolved skeleton units. This second pass of
// registerModuleReferences doesn't do any new work, but it
// will collect top-level errors, which are suppressed. Module
// warnings were already displayed in the first iteration.
bool Quiet = true;
auto CUDie = CU->getUnitDIE(false);
if (!CUDie || LLVM_UNLIKELY(Options.Update) ||
!registerModuleReference(CUDie, *CU, ModuleMap, LinkContext.DMO,
LinkContext.Ranges, OffsetsStringPool,
UniquingStringPool, ODRContexts, UnitID,
Quiet)) {
LinkContext.CompileUnits.push_back(llvm::make_unique<CompileUnit>(
*CU, UnitID++, !Options.NoODR && !Options.Update, ""));
}
}
// Now build the DIE parent links that we will use during the next phase.
for (auto &CurrentUnit : LinkContext.CompileUnits) {
auto CUDie = CurrentUnit->getOrigUnit().getUnitDIE();
if (!CUDie)
continue;
}
for (const auto &CU : LinkContext.DwarfContext->compile_units()) {
updateDwarfVersion(CU->getVersion());
// The !registerModuleReference() condition effectively skips
// over fully resolved skeleton units. This second pass of
// registerModuleReferences doesn't do any new work, but it
// will collect top-level errors, which are suppressed. Module
// warnings were already displayed in the first iteration.
bool Quiet = true;
auto CUDie = CU->getUnitDIE(false);
if (!CUDie || LLVM_UNLIKELY(Options.Update) ||
!registerModuleReference(CUDie, *CU, ModuleMap, LinkContext.DMO,
LinkContext.Ranges, OffsetsStringPool,
UniquingStringPool, ODRContexts, UnitID,
Quiet)) {
LinkContext.CompileUnits.push_back(llvm::make_unique<CompileUnit>(
*CU, UnitID++, !Options.NoODR && !Options.Update, ""));
}
}
// Now build the DIE parent links that we will use during the next phase.
for (auto &CurrentUnit : LinkContext.CompileUnits) {
auto CUDie = CurrentUnit->getOrigUnit().getUnitDIE();
if (!CUDie)
continue;
analyzeContextInfo(CurrentUnit->getOrigUnit().getUnitDIE(), 0,
*CurrentUnit, &ODRContexts.getRoot(),
UniquingStringPool, ODRContexts);
}
std::unique_lock<std::mutex> LockGuard(ProcessedFilesMutex);
ProcessedFiles.set(i);
ProcessedFilesConditionVariable.notify_one();
analyzeContextInfo(CurrentUnit->getOrigUnit().getUnitDIE(), 0,
*CurrentUnit, &ODRContexts.getRoot(),
UniquingStringPool, ODRContexts);
}
};
@ -2534,57 +2525,48 @@ bool DwarfLinker::link(const DebugMap &Map) {
// Note, although this loop runs in serial, it can run in parallel with
// the analyzeContextInfo loop so long as we process files with indices >=
// than those processed by analyzeContextInfo.
auto CloneLambda = [&]() {
for (unsigned i = 0, e = NumObjects; i != e; ++i) {
{
std::unique_lock<std::mutex> LockGuard(ProcessedFilesMutex);
if (!ProcessedFiles[i]) {
ProcessedFilesConditionVariable.wait(
LockGuard, [&]() { return ProcessedFiles[i]; });
}
}
auto CloneLambda = [&](size_t i) {
auto &LinkContext = ObjectContexts[i];
if (!LinkContext.ObjectFile)
return;
auto &LinkContext = ObjectContexts[i];
if (!LinkContext.ObjectFile)
continue;
// Then mark all the DIEs that need to be present in the linked output
// and collect some information about them.
// Note that this loop can not be merged with the previous one because
// cross-cu references require the ParentIdx to be setup for every CU in
// the object file before calling this.
if (LLVM_UNLIKELY(Options.Update)) {
for (auto &CurrentUnit : LinkContext.CompileUnits)
CurrentUnit->markEverythingAsKept();
Streamer->copyInvariantDebugSection(*LinkContext.ObjectFile);
} else {
for (auto &CurrentUnit : LinkContext.CompileUnits)
lookForDIEsToKeep(LinkContext.RelocMgr, LinkContext.Ranges,
LinkContext.CompileUnits,
CurrentUnit->getOrigUnit().getUnitDIE(),
LinkContext.DMO, *CurrentUnit, 0);
}
// The calls to applyValidRelocs inside cloneDIE will walk the reloc
// array again (in the same way findValidRelocsInDebugInfo() did). We
// need to reset the NextValidReloc index to the beginning.
LinkContext.RelocMgr.resetValidRelocs();
if (LinkContext.RelocMgr.hasValidRelocs() ||
LLVM_UNLIKELY(Options.Update))
DIECloner(*this, LinkContext.RelocMgr, DIEAlloc,
LinkContext.CompileUnits, Options)
.cloneAllCompileUnits(*LinkContext.DwarfContext, LinkContext.DMO,
LinkContext.Ranges, OffsetsStringPool);
if (!Options.NoOutput && !LinkContext.CompileUnits.empty() &&
LLVM_LIKELY(!Options.Update))
patchFrameInfoForObject(
LinkContext.DMO, LinkContext.Ranges, *LinkContext.DwarfContext,
LinkContext.CompileUnits[0]->getOrigUnit().getAddressByteSize());
// Clean-up before starting working on the next object.
endDebugObject(LinkContext);
// Then mark all the DIEs that need to be present in the linked output
// and collect some information about them.
// Note that this loop can not be merged with the previous one because
// cross-cu references require the ParentIdx to be setup for every CU in
// the object file before calling this.
if (LLVM_UNLIKELY(Options.Update)) {
for (auto &CurrentUnit : LinkContext.CompileUnits)
CurrentUnit->markEverythingAsKept();
Streamer->copyInvariantDebugSection(*LinkContext.ObjectFile);
} else {
for (auto &CurrentUnit : LinkContext.CompileUnits)
lookForDIEsToKeep(LinkContext.RelocMgr, LinkContext.Ranges,
LinkContext.CompileUnits,
CurrentUnit->getOrigUnit().getUnitDIE(),
LinkContext.DMO, *CurrentUnit, 0);
}
// The calls to applyValidRelocs inside cloneDIE will walk the reloc
// array again (in the same way findValidRelocsInDebugInfo() did). We
// need to reset the NextValidReloc index to the beginning.
LinkContext.RelocMgr.resetValidRelocs();
if (LinkContext.RelocMgr.hasValidRelocs() || LLVM_UNLIKELY(Options.Update))
DIECloner(*this, LinkContext.RelocMgr, DIEAlloc, LinkContext.CompileUnits,
Options)
.cloneAllCompileUnits(*LinkContext.DwarfContext, LinkContext.DMO,
LinkContext.Ranges, OffsetsStringPool);
if (!Options.NoOutput && !LinkContext.CompileUnits.empty() &&
LLVM_LIKELY(!Options.Update))
patchFrameInfoForObject(
LinkContext.DMO, LinkContext.Ranges, *LinkContext.DwarfContext,
LinkContext.CompileUnits[0]->getOrigUnit().getAddressByteSize());
// Clean-up before starting working on the next object.
endDebugObject(LinkContext);
};
auto EmitLambda = [&]() {
// Emit everything that's global.
if (!Options.NoOutput) {
Streamer->emitAbbrevs(Abbreviations, MaxDwarfVersion);
@ -2606,16 +2588,44 @@ bool DwarfLinker::link(const DebugMap &Map) {
}
};
// FIXME: The DwarfLinker can have some very deep recursion that can max
// out the (significantly smaller) stack when using threads. We don't
// want this limitation when we only have a single thread.
auto AnalyzeAll = [&]() {
for (unsigned i = 0, e = NumObjects; i != e; ++i) {
AnalyzeLambda(i);
std::unique_lock<std::mutex> LockGuard(ProcessedFilesMutex);
ProcessedFiles.set(i);
ProcessedFilesConditionVariable.notify_one();
}
};
auto CloneAll = [&]() {
for (unsigned i = 0, e = NumObjects; i != e; ++i) {
{
std::unique_lock<std::mutex> LockGuard(ProcessedFilesMutex);
if (!ProcessedFiles[i]) {
ProcessedFilesConditionVariable.wait(
LockGuard, [&]() { return ProcessedFiles[i]; });
}
}
CloneLambda(i);
}
EmitLambda();
};
// To limit memory usage in the single threaded case, analyze and clone are
// run sequentially so the LinkContext is freed after processing each object
// in endDebugObject.
if (Options.Threads == 1) {
AnalyzeLambda();
CloneLambda();
for (unsigned i = 0, e = NumObjects; i != e; ++i) {
AnalyzeLambda(i);
CloneLambda(i);
}
EmitLambda();
} else {
ThreadPool pool(2);
pool.async(AnalyzeLambda);
pool.async(CloneLambda);
pool.async(AnalyzeAll);
pool.async(CloneAll);
pool.wait();
}