[dsymutil] Reduce peak memory usage for the single threaded execution.

Keeping the compile units in memory is expensive. We allocate them in the analyze phase and deallocate them again once we've finished cloning. This poses a problem in the single-threaded case, where we did all the analysis first, followed by all the cloning, which meant we had all the link contexts in memory right after the analysis finished. This patch changes the way we order work in the single-threaded case: instead of running the analysis for every object and only then the cloning for every object, we now interleave the two per object, so we can deallocate the memory as soon as a file is processed. The result is binary identical, and peak memory usage went down from 13.43GB to 5.73GB for a debug build of trunk clang. Differential revision: https://reviews.llvm.org/D51618 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@341568 91177308-0d34-0410-b5e6-96231b3b80d8
2025-02-15 08:19:51 +00:00 · 2018-09-06 17:31:59 +00:00 · 2018-09-06 17:31:59 +00:00 · 9840d7c8db
commit 9840d7c8db
parent bd42453404
1 changed files with 107 additions and 97 deletions
--- a/tools/dsymutil/DwarfLinker.cpp
+++ b/tools/dsymutil/DwarfLinker.cpp
@ -2483,50 +2483,41 @@ bool DwarfLinker::link(const DebugMap &Map) {
  std::condition_variable ProcessedFilesConditionVariable;
  BitVector ProcessedFiles(NumObjects, false);

-  // Now do analyzeContextInfo in parallel as it is particularly expensive.
-  auto AnalyzeLambda = [&]() {
-    for (unsigned i = 0, e = NumObjects; i != e; ++i) {
-      auto &LinkContext = ObjectContexts[i];
+  //  Analyzing the context info is particularly expensive so it is executed in
+  //  parallel with emitting the previous compile unit.
+  auto AnalyzeLambda = [&](size_t i) {
+    auto &LinkContext = ObjectContexts[i];

-      if (!LinkContext.ObjectFile || !LinkContext.DwarfContext) {
-        std::unique_lock<std::mutex> LockGuard(ProcessedFilesMutex);
-        ProcessedFiles.set(i);
-        ProcessedFilesConditionVariable.notify_one();
+    if (!LinkContext.ObjectFile || !LinkContext.DwarfContext)
+      return;
+
+    for (const auto &CU : LinkContext.DwarfContext->compile_units()) {
+      updateDwarfVersion(CU->getVersion());
+      // The !registerModuleReference() condition effectively skips
+      // over fully resolved skeleton units. This second pass of
+      // registerModuleReferences doesn't do any new work, but it
+      // will collect top-level errors, which are suppressed. Module
+      // warnings were already displayed in the first iteration.
+      bool Quiet = true;
+      auto CUDie = CU->getUnitDIE(false);
+      if (!CUDie || LLVM_UNLIKELY(Options.Update) ||
+          !registerModuleReference(CUDie, *CU, ModuleMap, LinkContext.DMO,
+                                   LinkContext.Ranges, OffsetsStringPool,
+                                   UniquingStringPool, ODRContexts, UnitID,
+                                   Quiet)) {
+        LinkContext.CompileUnits.push_back(llvm::make_unique<CompileUnit>(
+            *CU, UnitID++, !Options.NoODR && !Options.Update, ""));
+      }
+    }
+
+    // Now build the DIE parent links that we will use during the next phase.
+    for (auto &CurrentUnit : LinkContext.CompileUnits) {
+      auto CUDie = CurrentUnit->getOrigUnit().getUnitDIE();
+      if (!CUDie)
        continue;
-      }
-
-      for (const auto &CU : LinkContext.DwarfContext->compile_units()) {
-        updateDwarfVersion(CU->getVersion());
-        // The !registerModuleReference() condition effectively skips
-        // over fully resolved skeleton units. This second pass of
-        // registerModuleReferences doesn't do any new work, but it
-        // will collect top-level errors, which are suppressed. Module
-        // warnings were already displayed in the first iteration.
-        bool Quiet = true;
-        auto CUDie = CU->getUnitDIE(false);
-        if (!CUDie || LLVM_UNLIKELY(Options.Update) ||
-            !registerModuleReference(CUDie, *CU, ModuleMap, LinkContext.DMO,
-                                     LinkContext.Ranges, OffsetsStringPool,
-                                     UniquingStringPool, ODRContexts, UnitID,
-                                     Quiet)) {
-          LinkContext.CompileUnits.push_back(llvm::make_unique<CompileUnit>(
-              *CU, UnitID++, !Options.NoODR && !Options.Update, ""));
-        }
-      }
-      
-      // Now build the DIE parent links that we will use during the next phase.
-      for (auto &CurrentUnit : LinkContext.CompileUnits) {
-        auto CUDie = CurrentUnit->getOrigUnit().getUnitDIE();
-        if (!CUDie)
-          continue;
-        analyzeContextInfo(CurrentUnit->getOrigUnit().getUnitDIE(), 0,
-                           *CurrentUnit, &ODRContexts.getRoot(),
-                           UniquingStringPool, ODRContexts);
-      }
-
-      std::unique_lock<std::mutex> LockGuard(ProcessedFilesMutex);
-      ProcessedFiles.set(i);
-      ProcessedFilesConditionVariable.notify_one();
+      analyzeContextInfo(CurrentUnit->getOrigUnit().getUnitDIE(), 0,
+                         *CurrentUnit, &ODRContexts.getRoot(),
+                         UniquingStringPool, ODRContexts);
    }
  };

@ -2534,57 +2525,48 @@ bool DwarfLinker::link(const DebugMap &Map) {
  // Note, although this loop runs in serial, it can run in parallel with
  // the analyzeContextInfo loop so long as we process files with indices >=
  // than those processed by analyzeContextInfo.
-  auto CloneLambda = [&]() {
-    for (unsigned i = 0, e = NumObjects; i != e; ++i) {
-      {
-        std::unique_lock<std::mutex> LockGuard(ProcessedFilesMutex);
-        if (!ProcessedFiles[i]) {
-          ProcessedFilesConditionVariable.wait(
-              LockGuard, [&]() { return ProcessedFiles[i]; });
-        }
-      }
+  auto CloneLambda = [&](size_t i) {
+    auto &LinkContext = ObjectContexts[i];
+    if (!LinkContext.ObjectFile)
+      return;

-      auto &LinkContext = ObjectContexts[i];
-      if (!LinkContext.ObjectFile)
-        continue;
-
-      // Then mark all the DIEs that need to be present in the linked output
-      // and collect some information about them.
-      // Note that this loop can not be merged with the previous one because
-      // cross-cu references require the ParentIdx to be setup for every CU in
-      // the object file before calling this.
-      if (LLVM_UNLIKELY(Options.Update)) {
-        for (auto &CurrentUnit : LinkContext.CompileUnits)
-          CurrentUnit->markEverythingAsKept();
-        Streamer->copyInvariantDebugSection(*LinkContext.ObjectFile);
-      } else {
-        for (auto &CurrentUnit : LinkContext.CompileUnits)
-          lookForDIEsToKeep(LinkContext.RelocMgr, LinkContext.Ranges,
-                            LinkContext.CompileUnits,
-                            CurrentUnit->getOrigUnit().getUnitDIE(),
-                            LinkContext.DMO, *CurrentUnit, 0);
-      }
-
-      // The calls to applyValidRelocs inside cloneDIE will walk the reloc
-      // array again (in the same way findValidRelocsInDebugInfo() did). We
-      // need to reset the NextValidReloc index to the beginning.
-      LinkContext.RelocMgr.resetValidRelocs();
-      if (LinkContext.RelocMgr.hasValidRelocs() ||
-          LLVM_UNLIKELY(Options.Update))
-        DIECloner(*this, LinkContext.RelocMgr, DIEAlloc,
-                  LinkContext.CompileUnits, Options)
-            .cloneAllCompileUnits(*LinkContext.DwarfContext, LinkContext.DMO,
-                                  LinkContext.Ranges, OffsetsStringPool);
-      if (!Options.NoOutput && !LinkContext.CompileUnits.empty() &&
-          LLVM_LIKELY(!Options.Update))
-        patchFrameInfoForObject(
-            LinkContext.DMO, LinkContext.Ranges, *LinkContext.DwarfContext,
-            LinkContext.CompileUnits[0]->getOrigUnit().getAddressByteSize());
-
-      // Clean-up before starting working on the next object.
-      endDebugObject(LinkContext);
+    // Then mark all the DIEs that need to be present in the linked output
+    // and collect some information about them.
+    // Note that this loop can not be merged with the previous one because
+    // cross-cu references require the ParentIdx to be setup for every CU in
+    // the object file before calling this.
+    if (LLVM_UNLIKELY(Options.Update)) {
+      for (auto &CurrentUnit : LinkContext.CompileUnits)
+        CurrentUnit->markEverythingAsKept();
+      Streamer->copyInvariantDebugSection(*LinkContext.ObjectFile);
+    } else {
+      for (auto &CurrentUnit : LinkContext.CompileUnits)
+        lookForDIEsToKeep(LinkContext.RelocMgr, LinkContext.Ranges,
+                          LinkContext.CompileUnits,
+                          CurrentUnit->getOrigUnit().getUnitDIE(),
+                          LinkContext.DMO, *CurrentUnit, 0);
    }

+    // The calls to applyValidRelocs inside cloneDIE will walk the reloc
+    // array again (in the same way findValidRelocsInDebugInfo() did). We
+    // need to reset the NextValidReloc index to the beginning.
+    LinkContext.RelocMgr.resetValidRelocs();
+    if (LinkContext.RelocMgr.hasValidRelocs() || LLVM_UNLIKELY(Options.Update))
+      DIECloner(*this, LinkContext.RelocMgr, DIEAlloc, LinkContext.CompileUnits,
+                Options)
+          .cloneAllCompileUnits(*LinkContext.DwarfContext, LinkContext.DMO,
+                                LinkContext.Ranges, OffsetsStringPool);
+    if (!Options.NoOutput && !LinkContext.CompileUnits.empty() &&
+        LLVM_LIKELY(!Options.Update))
+      patchFrameInfoForObject(
+          LinkContext.DMO, LinkContext.Ranges, *LinkContext.DwarfContext,
+          LinkContext.CompileUnits[0]->getOrigUnit().getAddressByteSize());
+
+    // Clean-up before starting working on the next object.
+    endDebugObject(LinkContext);
+  };
+
+  auto EmitLambda = [&]() {
    // Emit everything that's global.
    if (!Options.NoOutput) {
      Streamer->emitAbbrevs(Abbreviations, MaxDwarfVersion);
@ -2606,16 +2588,44 @@ bool DwarfLinker::link(const DebugMap &Map) {
    }
  };

-  // FIXME: The DwarfLinker can have some very deep recursion that can max
-  // out the (significantly smaller) stack when using threads. We don't
-  // want this limitation when we only have a single thread.
+  auto AnalyzeAll = [&]() {
+    for (unsigned i = 0, e = NumObjects; i != e; ++i) {
+      AnalyzeLambda(i);
+
+      std::unique_lock<std::mutex> LockGuard(ProcessedFilesMutex);
+      ProcessedFiles.set(i);
+      ProcessedFilesConditionVariable.notify_one();
+    }
+  };
+
+  auto CloneAll = [&]() {
+    for (unsigned i = 0, e = NumObjects; i != e; ++i) {
+      {
+        std::unique_lock<std::mutex> LockGuard(ProcessedFilesMutex);
+        if (!ProcessedFiles[i]) {
+          ProcessedFilesConditionVariable.wait(
+              LockGuard, [&]() { return ProcessedFiles[i]; });
+        }
+      }
+
+      CloneLambda(i);
+    }
+    EmitLambda();
+  };
+
+  // To limit memory usage in the single threaded case, analyze and clone are
+  // run sequentially so the LinkContext is freed after processing each object
+  // in endDebugObject.
  if (Options.Threads == 1) {
-    AnalyzeLambda();
-    CloneLambda();
+    for (unsigned i = 0, e = NumObjects; i != e; ++i) {
+      AnalyzeLambda(i);
+      CloneLambda(i);
+    }
+    EmitLambda();
  } else {
    ThreadPool pool(2);
-    pool.async(AnalyzeLambda);
-    pool.async(CloneLambda);
+    pool.async(AnalyzeAll);
+    pool.async(CloneAll);
    pool.wait();
  }