[lld-macho][nfc] Give non-text ConcatOutputSections order-independent finalization

This diff is motivated by my work to add proper DWARF unwind support. As detailed in PR50956, functions that need DWARF unwind need to have compact unwind entries synthesized for them. These CU entries encode an offset within `__eh_frame` that points to the corresponding DWARF FDE. In order to encode this offset during `UnwindInfoSectionImpl::finalize()`, we need to first assign values to `InputSection::outSecOff` for each `__eh_frame` subsection. But `__eh_frame` is ordered after `__unwind_info` (according to ld64 at least), which puts us in a bit of a bind: `outSecOff` gets assigned during finalization, but `__eh_frame` is being finalized after `__unwind_info`. But it occurred to me that there's no real need for most ConcatOutputSections to be finalized sequentially. It's only necessary for text-containing ConcatOutputSections that may contain branch relocs which may need thunks. ConcatOutputSections containing other types of data can be finalized in any order. This diff moves the finalization logic for non-text sections into a separate `finalizeContents()` method. This method is called before section address assignment & unwind info finalization take place. In theory we could call these `finalizeContents()` methods in parallel, but in practice it seems to be faster to do it all on the main thread. Reviewed By: #lld-macho, oontvoo Differential Revision: https://reviews.llvm.org/D123279
2025-02-13 13:45:16 +00:00 · 2022-04-07 18:13:27 -04:00 · 2022-04-07 18:13:27 -04:00 · b440c25742
commit b440c25742
parent 16cf9e6dad
3 changed files with 80 additions and 35 deletions
--- a/lld/MachO/ConcatOutputSection.cpp
+++ b/lld/MachO/ConcatOutputSection.cpp
@ -121,7 +121,7 @@ DenseMap<Symbol *, ThunkInfo> lld::macho::thunkMap;
 // instructions, whereas CISC (i.e., x86) generally doesn't. RISC only needs
 // thunks for programs so large that branch source & destination addresses
 // might differ more than the range of branch instruction(s).
-bool ConcatOutputSection::needsThunks() const {
+bool TextOutputSection::needsThunks() const {
  if (!target->usesThunks())
    return false;
  uint64_t isecAddr = addr;
@ -138,7 +138,7 @@ bool ConcatOutputSection::needsThunks() const {
      auto *sym = r.referent.get<Symbol *>();
      // Pre-populate the thunkMap and memoize call site counts for every
      // InputSection and ThunkInfo. We do this for the benefit of
-      // ConcatOutputSection::estimateStubsInRangeVA()
+      // estimateStubsInRangeVA().
      ThunkInfo &thunkInfo = thunkMap[sym];
      // Knowing ThunkInfo call site count will help us know whether or not we
      // might need to create more for this referent at the time we are
@ -154,7 +154,7 @@ bool ConcatOutputSection::needsThunks() const {
 // Since __stubs is placed after __text, we must estimate the address
 // beyond which stubs are within range of a simple forward branch.
 // This is called exactly once, when the last input section has been finalized.
-uint64_t ConcatOutputSection::estimateStubsInRangeVA(size_t callIdx) const {
+uint64_t TextOutputSection::estimateStubsInRangeVA(size_t callIdx) const {
  // Tally the functions which still have call sites remaining to process,
  // which yields the maximum number of thunks we might yet place.
  size_t maxPotentialThunks = 0;
@ -193,23 +193,24 @@ uint64_t ConcatOutputSection::estimateStubsInRangeVA(size_t callIdx) const {
  return stubsInRangeVA;
 }

-void ConcatOutputSection::finalize() {
-  uint64_t isecAddr = addr;
-  uint64_t isecFileOff = fileOff;
-  auto finalizeOne = [&](ConcatInputSection *isec) {
-    isecAddr = alignTo(isecAddr, isec->align);
-    isecFileOff = alignTo(isecFileOff, isec->align);
-    isec->outSecOff = isecAddr - addr;
-    isec->isFinal = true;
-    isecAddr += isec->getSize();
-    isecFileOff += isec->getFileSize();
-  };
+void ConcatOutputSection::finalizeOne(ConcatInputSection *isec) {
+  size = alignTo(size, isec->align); // pad running size to isec's alignment
+  fileSize = alignTo(fileSize, isec->align); // likewise for the file size
+  isec->outSecOff = size; // isec begins at the aligned running size
+  isec->isFinal = true;
+  size += isec->getSize(); // running totals now include isec
+  fileSize += isec->getFileSize();
+}

+void ConcatOutputSection::finalizeContents() {
+  for (ConcatInputSection *isec : inputs) // assign offsets in vector order
+    finalizeOne(isec);
+}
+
+void TextOutputSection::finalize() {
  if (!needsThunks()) {
    for (ConcatInputSection *isec : inputs)
      finalizeOne(isec);
-    size = isecAddr - addr;
-    fileSize = isecFileOff - fileOff;
    return;
  }

@ -225,7 +226,7 @@ void ConcatOutputSection::finalize() {
  // Walk all sections in order. Finalize all sections that are less than
  // forwardBranchRange in front of it.
  // isecVA is the address of the current section.
-  // isecAddr is the start address of the first non-finalized section.
+  // addr + size is the start address of the first non-finalized section.

  // inputs[finalIdx] is for finalization (address-assignment)
  size_t finalIdx = 0;
@ -246,7 +247,7 @@ void ConcatOutputSection::finalize() {
    // from the current position to the position where the thunks are inserted
    // grows. So leave room for a bunch of thunks.
    unsigned slop = 256 * thunkSize;
-    while (finalIdx < endIdx && isecAddr + inputs[finalIdx]->getSize() <
+    while (finalIdx < endIdx && addr + size + inputs[finalIdx]->getSize() <
                                    isecVA + forwardBranchRange - slop)
      finalizeOne(inputs[finalIdx++]);

@ -307,7 +308,7 @@ void ConcatOutputSection::finalize() {
        }
      }
      // ... otherwise, create a new thunk.
-      if (isecAddr > highVA) {
+      if (addr + size > highVA) {
        // There were too many consecutive branch instructions for `slop`
        // above. If you hit this: For the current algorithm, just bumping up
        // slop above and trying again is probably simplest. (See also PR51578
@ -342,12 +343,10 @@ void ConcatOutputSection::finalize() {
      thunkInfo.sym->used = true;
      target->populateThunk(thunkInfo.isec, funcSym);
      finalizeOne(thunkInfo.isec);
      thunks.push_back(thunkInfo.isec);
      ++thunkCount;
    }
  }
-  size = isecAddr - addr;
-  fileSize = isecFileOff - fileOff;

  log("thunks for " + parent->name + "," + name +
      ": funcs = " + std::to_string(thunkMap.size()) +
@ -358,6 +358,11 @@ void ConcatOutputSection::finalize() {
 }

 void ConcatOutputSection::writeTo(uint8_t *buf) const {
+  for (ConcatInputSection *isec : inputs)
+    isec->writeTo(buf + isec->outSecOff); // write at isec's assigned offset
+}
+
+void TextOutputSection::writeTo(uint8_t *buf) const {
  // Merge input sections from thunk & ordinary vectors
  size_t i = 0, ie = inputs.size();
  size_t t = 0, te = thunks.size();
@ -402,8 +407,14 @@ ConcatOutputSection *
 ConcatOutputSection::getOrCreateForInput(const InputSection *isec) {
  NamePair names = maybeRenameSection({isec->getSegName(), isec->getName()});
  ConcatOutputSection *&osec = concatOutputSections[names];
-  if (!osec)
-    osec = make<ConcatOutputSection>(names.second);
+  if (!osec) { // null slot: nothing created for `names` yet
+    if (isec->getSegName() == segment_names::text &&
+        isec->getName() != section_names::gccExceptTab &&
+        isec->getName() != section_names::ehFrame)
+      osec = make<TextOutputSection>(names.second); // thunk-capable variant
+    else
+      osec = make<ConcatOutputSection>(names.second); // plain concatenation
+  }
  return osec;
 }

--- a/lld/MachO/ConcatOutputSection.h
+++ b/lld/MachO/ConcatOutputSection.h
@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//

-#ifndef LLD_MACHO_MERGED_OUTPUT_SECTION_H
-#define LLD_MACHO_MERGED_OUTPUT_SECTION_H
+#ifndef LLD_MACHO_CONCAT_OUTPUT_SECTION_H
+#define LLD_MACHO_CONCAT_OUTPUT_SECTION_H

 #include "InputSection.h"
 #include "OutputSection.h"
@ -24,7 +24,7 @@ class Defined;
 // files that are labeled with the same segment and section name. This class
 // contains all such sections and writes the data from each section sequentially
 // in the final binary.
-class ConcatOutputSection final : public OutputSection {
+class ConcatOutputSection : public OutputSection {
 public:
  explicit ConcatOutputSection(StringRef name)
      : OutputSection(ConcatKind, name) {}
@ -37,27 +37,46 @@ public:
  uint64_t getSize() const override { return size; }
  uint64_t getFileSize() const override { return fileSize; }

+  // Assign values to InputSection::outSecOff. In contrast to TextOutputSection,
+  // which does this in its implementation of `finalize()`, we can do this
+  // without `finalize()`'s sequential guarantees detailed in the block comment
+  // of `OutputSection::finalize()`.
+  virtual void finalizeContents();
+
  void addInput(ConcatInputSection *input);
-  void finalize() override;
-  bool needsThunks() const;
-  uint64_t estimateStubsInRangeVA(size_t callIdx) const;
-
  void writeTo(uint8_t *buf) const override;

-  std::vector<ConcatInputSection *> inputs;
-  std::vector<ConcatInputSection *> thunks;
-
  static bool classof(const OutputSection *sec) {
    return sec->kind() == ConcatKind;
  }

  static ConcatOutputSection *getOrCreateForInput(const InputSection *);

-private:
-  void finalizeFlags(InputSection *input);
+  std::vector<ConcatInputSection *> inputs;

+protected:
  size_t size = 0;
  uint64_t fileSize = 0;
+  void finalizeOne(ConcatInputSection *);
+
+private:
+  void finalizeFlags(InputSection *input);
+};
+
+// ConcatOutputSections that contain code (text) require special handling to
+// support thunk insertion.
+class TextOutputSection : public ConcatOutputSection {
+public:
+  explicit TextOutputSection(StringRef name) : ConcatOutputSection(name) {}
+  void finalizeContents() override {} // no-op; finalize() does the layout
+  void finalize() override;
+  bool needsThunks() const;
+  void writeTo(uint8_t *buf) const override; // emits inputs + thunks
+
+private:
+  uint64_t estimateStubsInRangeVA(size_t callIdx) const;
+
+  std::vector<ConcatInputSection *> thunks; // filled in by finalize()
+};

 // We maintain one ThunkInfo per real function.
--- a/lld/MachO/Writer.cpp
+++ b/lld/MachO/Writer.cpp
@ -972,6 +972,21 @@ template <class LP> void Writer::createOutputSections() {
 void Writer::finalizeAddresses() {
  TimeTraceScope timeScope("Finalize addresses");
  uint64_t pageSize = target->getPageSize();
+
+  // We could parallelize this loop, but local benchmarking indicates it is
+  // faster to do it all in the main thread.
+  for (OutputSegment *seg : outputSegments) {
+    if (seg == linkEditSegment)
+      continue;
+    for (OutputSection *osec : seg->getSections()) {
+      if (!osec->isNeeded())
+        continue;
+      // Other kinds of OutputSections have already been finalized.
+      if (auto *concatOsec = dyn_cast<ConcatOutputSection>(osec))
+        concatOsec->finalizeContents();
+    }
+  }
+
  // Ensure that segments (and the sections they contain) are allocated
  // addresses in ascending order, which dyld requires.
  //