Add an index for Module Metadata records in the bitcode

This index records the position of each metadata record in the bitcode, so that the reader will be able to lazy-load each individual record on demand. We also make sure that every abbrev is emitted upfront so that the block can be skipped while reading. I don't plan to commit this before having the reader counterpart, but I figured this can be reviewed mostly independently. Recommit r290684 (was reverted in r290686 because a test was broken) after adding a threshold to avoid emitting the index when it is unnecessary (small amount of metadata). This optimization "hides" a limitation of the ability to backpatch in the bitstream: we can only backpatch safely when the position has been flushed. So if we emit an index for a single metadata record, it is possible that (part of) the offset placeholder hasn't been flushed and the backpatch will fail. Differential Revision: https://reviews.llvm.org/D28083 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@290690 91177308-0d34-0410-b5e6-96231b3b80d8
2025-01-19 02:42:58 +00:00 · 2016-12-28 22:30:28 +00:00 · 2016-12-28 22:30:28 +00:00 · 89bf9692cc
commit 89bf9692cc
parent 0ece61756a
9 changed files with 174 additions and 35 deletions
--- a/include/llvm/Bitcode/BitstreamWriter.h
+++ b/include/llvm/Bitcode/BitstreamWriter.h
@ -112,6 +112,11 @@ public:
        &Out[ByteNo], NewWord, BitNo & 7);
  }

+  void BackpatchWord64(uint64_t BitNo, uint64_t Val) {
+    BackpatchWord(BitNo, (uint32_t)Val);
+    BackpatchWord(BitNo + 32, (uint32_t)(Val >> 32));
+  }
+
  void Emit(uint32_t Val, unsigned NumBits) {
    assert(NumBits && NumBits <= 32 && "Invalid value size!");
    assert((Val & ~(~0U >> (32-NumBits))) == 0 && "High bits set!");
@ -279,7 +284,7 @@ private:
    default: llvm_unreachable("Unknown encoding!");
    case BitCodeAbbrevOp::Fixed:
      if (Op.getEncodingData())
-        Emit((unsigned)V, (unsigned)Op.getEncodingData());
+        Emit64(V, (unsigned)Op.getEncodingData());
      break;
    case BitCodeAbbrevOp::VBR:
      if (Op.getEncodingData())
--- a/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/include/llvm/Bitcode/LLVMBitCodes.h
@ -218,30 +218,30 @@ enum GlobalValueSummarySymtabCodes {
 };

 enum MetadataCodes {
-  METADATA_STRING_OLD = 1,       // MDSTRING:      [values]
-  METADATA_VALUE = 2,            // VALUE:         [type num, value num]
-  METADATA_NODE = 3,             // NODE:          [n x md num]
-  METADATA_NAME = 4,             // STRING:        [values]
-  METADATA_DISTINCT_NODE = 5,    // DISTINCT_NODE: [n x md num]
-  METADATA_KIND = 6,             // [n x [id, name]]
-  METADATA_LOCATION = 7,         // [distinct, line, col, scope, inlined-at?]
-  METADATA_OLD_NODE = 8,         // OLD_NODE:      [n x (type num, value num)]
-  METADATA_OLD_FN_NODE = 9,      // OLD_FN_NODE:   [n x (type num, value num)]
-  METADATA_NAMED_NODE = 10,      // NAMED_NODE:    [n x mdnodes]
-  METADATA_ATTACHMENT = 11,      // [m x [value, [n x [id, mdnode]]]
-  METADATA_GENERIC_DEBUG = 12,   // [distinct, tag, vers, header, n x md num]
-  METADATA_SUBRANGE = 13,        // [distinct, count, lo]
-  METADATA_ENUMERATOR = 14,      // [distinct, value, name]
-  METADATA_BASIC_TYPE = 15,      // [distinct, tag, name, size, align, enc]
-  METADATA_FILE = 16,            // [distinct, filename, directory, checksumkind, checksum]
-  METADATA_DERIVED_TYPE = 17,    // [distinct, ...]
-  METADATA_COMPOSITE_TYPE = 18,  // [distinct, ...]
-  METADATA_SUBROUTINE_TYPE = 19, // [distinct, flags, types, cc]
-  METADATA_COMPILE_UNIT = 20,    // [distinct, ...]
-  METADATA_SUBPROGRAM = 21,      // [distinct, ...]
-  METADATA_LEXICAL_BLOCK = 22,   // [distinct, scope, file, line, column]
+  METADATA_STRING_OLD = 1,     // MDSTRING:      [values]
+  METADATA_VALUE = 2,          // VALUE:         [type num, value num]
+  METADATA_NODE = 3,           // NODE:          [n x md num]
+  METADATA_NAME = 4,           // STRING:        [values]
+  METADATA_DISTINCT_NODE = 5,  // DISTINCT_NODE: [n x md num]
+  METADATA_KIND = 6,           // [n x [id, name]]
+  METADATA_LOCATION = 7,       // [distinct, line, col, scope, inlined-at?]
+  METADATA_OLD_NODE = 8,       // OLD_NODE:      [n x (type num, value num)]
+  METADATA_OLD_FN_NODE = 9,    // OLD_FN_NODE:   [n x (type num, value num)]
+  METADATA_NAMED_NODE = 10,    // NAMED_NODE:    [n x mdnodes]
+  METADATA_ATTACHMENT = 11,    // [m x [value, [n x [id, mdnode]]]
+  METADATA_GENERIC_DEBUG = 12, // [distinct, tag, vers, header, n x md num]
+  METADATA_SUBRANGE = 13,      // [distinct, count, lo]
+  METADATA_ENUMERATOR = 14,    // [distinct, value, name]
+  METADATA_BASIC_TYPE = 15,    // [distinct, tag, name, size, align, enc]
+  METADATA_FILE = 16, // [distinct, filename, directory, checksumkind, checksum]
+  METADATA_DERIVED_TYPE = 17,       // [distinct, ...]
+  METADATA_COMPOSITE_TYPE = 18,     // [distinct, ...]
+  METADATA_SUBROUTINE_TYPE = 19,    // [distinct, flags, types, cc]
+  METADATA_COMPILE_UNIT = 20,       // [distinct, ...]
+  METADATA_SUBPROGRAM = 21,         // [distinct, ...]
+  METADATA_LEXICAL_BLOCK = 22,      // [distinct, scope, file, line, column]
  METADATA_LEXICAL_BLOCK_FILE = 23, //[distinct, scope, file, discriminator]
-  METADATA_NAMESPACE = 24,       // [distinct, scope, file, name, line, exportSymbols]
+  METADATA_NAMESPACE = 24, // [distinct, scope, file, name, line, exportSymbols]
  METADATA_TEMPLATE_TYPE = 25,   // [distinct, scope, name, type, ...]
  METADATA_TEMPLATE_VALUE = 26,  // [distinct, scope, name, type, value, ...]
  METADATA_GLOBAL_VAR = 27,      // [distinct, ...]
@ -254,7 +254,9 @@ enum MetadataCodes {
  METADATA_MACRO_FILE = 34,      // [distinct, macinfo, line, file, ...]
  METADATA_STRINGS = 35,         // [count, offset] blob([lengths][chars])
  METADATA_GLOBAL_DECL_ATTACHMENT = 36, // [valueid, n x [id, mdnode]]
-  METADATA_GLOBAL_VAR_EXPR = 37, // [distinct, var, expr]
+  METADATA_GLOBAL_VAR_EXPR = 37,        // [distinct, var, expr]
+  METADATA_INDEX_OFFSET = 38,           // [offset]
+  METADATA_INDEX = 39,                  // [bitpos]
 };

 // The constants block (CONSTANTS_BLOCK_ID) describes emission for each
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@ -38,6 +38,11 @@
 using namespace llvm;

 namespace {
+
+cl::opt<unsigned>
+    IndexThreshold("bitcode-mdindex-threshold", cl::Hidden, cl::init(25),
+                   cl::desc("Number of metadatas above which we emit an index "
+                            "to enable lazy-loading"));
 /// These are manifest constants used by the bitcode writer. They do not need to
 /// be kept in sync with the reader, but need to be consistent within this file.
 enum {
@ -224,7 +229,9 @@ private:
  void writeMetadataStrings(ArrayRef<const Metadata *> Strings,
                            SmallVectorImpl<uint64_t> &Record);
  void writeMetadataRecords(ArrayRef<const Metadata *> MDs,
-                            SmallVectorImpl<uint64_t> &Record);
+                            SmallVectorImpl<uint64_t> &Record,
+                            std::vector<unsigned> *MDAbbrevs = nullptr,
+                            std::vector<uint64_t> *IndexPos = nullptr);
  void writeModuleMetadata();
  void writeFunctionMetadata(const Function &F);
  void writeFunctionMetadataAttachment(const Function &F);
@ -1854,8 +1861,16 @@ void ModuleBitcodeWriter::writeMetadataStrings(
  Record.clear();
 }

+// Generates an enum to use as an index in the Abbrev array of Metadata record.
+enum MetadataAbbrev : unsigned {
+#define HANDLE_MDNODE_LEAF(CLASS) CLASS##AbbrevID,
+#include "llvm/IR/Metadata.def"
+  LastPlusOne
+};
+
 void ModuleBitcodeWriter::writeMetadataRecords(
-    ArrayRef<const Metadata *> MDs, SmallVectorImpl<uint64_t> &Record) {
+    ArrayRef<const Metadata *> MDs, SmallVectorImpl<uint64_t> &Record,
+    std::vector<unsigned> *MDAbbrevs, std::vector<uint64_t> *IndexPos) {
  if (MDs.empty())
    return;

@ -1864,6 +1879,8 @@ void ModuleBitcodeWriter::writeMetadataRecords(
 #include "llvm/IR/Metadata.def"

  for (const Metadata *MD : MDs) {
+    if (IndexPos)
+      IndexPos->push_back(Stream.GetCurrentBitNo());
    if (const MDNode *N = dyn_cast<MDNode>(MD)) {
      assert(N->isResolved() && "Expected forward references to be resolved");

@ -1872,7 +1889,11 @@ void ModuleBitcodeWriter::writeMetadataRecords(
        llvm_unreachable("Invalid MDNode subclass");
 #define HANDLE_MDNODE_LEAF(CLASS)                                              \
  case Metadata::CLASS##Kind:                                                  \
-    write##CLASS(cast<CLASS>(N), Record, CLASS##Abbrev);                       \
+    if (MDAbbrevs)                                                             \
+      write##CLASS(cast<CLASS>(N), Record,                                     \
+                   (*MDAbbrevs)[MetadataAbbrev::CLASS##AbbrevID]);             \
+    else                                                                       \
+      write##CLASS(cast<CLASS>(N), Record, CLASS##Abbrev);                     \
    continue;
 #include "llvm/IR/Metadata.def"
      }
@ -1885,10 +1906,76 @@ void ModuleBitcodeWriter::writeModuleMetadata() {
  if (!VE.hasMDs() && M.named_metadata_empty())
    return;

-  Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3);
+  Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 4);
  SmallVector<uint64_t, 64> Record;
+
+  // Emit all abbrevs upfront, so that the reader can jump in the middle of the
+  // block and load any metadata.
+  std::vector<unsigned> MDAbbrevs;
+
+  MDAbbrevs.resize(MetadataAbbrev::LastPlusOne);
+  MDAbbrevs[MetadataAbbrev::DILocationAbbrevID] = createDILocationAbbrev();
+  MDAbbrevs[MetadataAbbrev::GenericDINodeAbbrevID] =
+      createGenericDINodeAbbrev();
+
+  BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+  Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_INDEX_OFFSET));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 64));
+  unsigned OffsetAbbrev = Stream.EmitAbbrev(Abbv);
+
+  Abbv = new BitCodeAbbrev();
+  Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_INDEX));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+  unsigned IndexAbbrev = Stream.EmitAbbrev(Abbv);
+
+  // Emit MDStrings together upfront.
  writeMetadataStrings(VE.getMDStrings(), Record);
-  writeMetadataRecords(VE.getNonMDStrings(), Record);
+
+  // We only emit an index for the metadata record if we have more than a given
+  // (naive) threshold of metadatas, otherwise it is not worth it.
+  if (VE.getNonMDStrings().size() > IndexThreshold) {
+    // Write a placeholder value in for the offset of the metadata index,
+    // which is written after the records, so that it can include
+    // the offset of each entry. The placeholder offset will be
+    // updated after all records are emitted.
+    uint64_t Vals[] = {0};
+    Stream.EmitRecord(bitc::METADATA_INDEX_OFFSET, Vals, OffsetAbbrev);
+  }
+
+  // Compute and save the bit offset to the current position, which will be
+  // patched when we emit the index later. We can simply subtract the 64-bit
+  // fixed size from the current bit number to get the location to backpatch.
+  uint64_t IndexOffsetRecordBitPos = Stream.GetCurrentBitNo();
+
+  // This index will contain the bitpos for each individual record.
+  std::vector<uint64_t> IndexPos;
+  IndexPos.reserve(VE.getNonMDStrings().size());
+
+  // Write all the records
+  writeMetadataRecords(VE.getNonMDStrings(), Record, &MDAbbrevs, &IndexPos);
+
+  if (VE.getNonMDStrings().size() > IndexThreshold) {
+    // Now that we have emitted all the records we will emit the index. But
+    // first
+    // backpatch the forward reference so that the reader can skip the records
+    // efficiently.
+    Stream.BackpatchWord64(IndexOffsetRecordBitPos - 64,
+                           Stream.GetCurrentBitNo() - IndexOffsetRecordBitPos);
+
+    // Delta encode the index.
+    uint64_t PreviousValue = IndexOffsetRecordBitPos;
+    for (auto &Elt : IndexPos) {
+      auto EltDelta = Elt - PreviousValue;
+      PreviousValue = Elt;
+      Elt = EltDelta;
+    }
+    // Emit the index record.
+    Stream.EmitRecord(bitc::METADATA_INDEX, IndexPos, IndexAbbrev);
+    IndexPos.clear();
+  }
+
+  // Write the named metadata now.
  writeNamedMetadata(Record);

  auto AddDeclAttachedMetadata = [&](const GlobalObject &GO) {
--- a/test/Bitcode/mdnodes-distinct-in-post-order.ll
+++ b/test/Bitcode/mdnodes-distinct-in-post-order.ll
@ -1,4 +1,5 @@
-; RUN: llvm-as <%s | llvm-bcanalyzer -dump | FileCheck %s
+; RUN: llvm-as <%s -bitcode-mdindex-threshold=0 | llvm-bcanalyzer -dump | FileCheck %s -check-prefix=CHECK  -check-prefix=MDINDEX
+; RUN: llvm-as <%s | llvm-bcanalyzer -dump | FileCheck %s -check-prefix=CHECK
 ; Check that distinct nodes are emitted in post-order to avoid unnecessary
 ; forward references.

@ -17,6 +18,11 @@
 ; CHECK-NEXT:  <DISTINCT_NODE op0=1 op1=3 op2=2/>
 !4 = distinct !{!1, !3, !2}

+; Before the named records we emit the index containing the position of the
+; previously emitted records, but only if we have a number of record above
+; a threshold (can be controlled through `-bitcode-mdindex-threshold`).
+; MDINDEX:  <INDEX {{.*}} (offset match)
+
 ; Note: named metadata nodes are not cannot reference null so their operands
 ; are numbered off-by-one.
 ; CHECK-NEXT:  <NAME
--- a/test/Bitcode/mdnodes-distinct-nodes-break-cycles.ll
+++ b/test/Bitcode/mdnodes-distinct-nodes-break-cycles.ll
@ -1,4 +1,4 @@
-; RUN: llvm-as <%s | llvm-bcanalyzer -dump | FileCheck %s
+; RUN: llvm-as <%s -bitcode-mdindex-threshold=0 | llvm-bcanalyzer -dump | FileCheck %s
 ; Check that distinct nodes break uniquing cycles, so that uniqued subgraphs
 ; are always in post-order.
 ;
@ -22,6 +22,10 @@
 ; CHECK-NEXT:  <NODE op0=2/>
 !3 = !{!2}

+; Before the named records we emit the index containing the position of the
+; previously emitted records
+; CHECK-NEXT:   <INDEX {{.*}} (offset match)
+
 ; Note: named metadata nodes are not cannot reference null so their operands
 ; are numbered off-by-one.
 ; CHECK-NEXT:  <NAME
--- a/test/Bitcode/mdnodes-distinct-nodes-first.ll
+++ b/test/Bitcode/mdnodes-distinct-nodes-first.ll
@ -1,4 +1,4 @@
-; RUN: llvm-as <%s | llvm-bcanalyzer -dump | FileCheck %s
+; RUN: llvm-as <%s -bitcode-mdindex-threshold=0 | llvm-bcanalyzer -dump | FileCheck %s
 ; Check that distinct nodes are emitted before uniqued nodes, even if that
 ; breaks post-order traversals.

@ -11,6 +11,10 @@
 ; CHECK-NEXT:  <NODE op0=1/>
 !2 = !{!1}

+; Before the named records we emit the index containing the position of the
+; previously emitted records
+; CHECK-NEXT:   <INDEX {{.*}} (offset match)
+
 ; Note: named metadata nodes are not cannot reference null so their operands
 ; are numbered off-by-one.
 ; CHECK-NEXT:  <NAME
--- a/test/Bitcode/mdnodes-in-post-order.ll
+++ b/test/Bitcode/mdnodes-in-post-order.ll
@ -1,4 +1,4 @@
-; RUN: llvm-as <%s | llvm-bcanalyzer -dump | FileCheck %s
+; RUN: llvm-as <%s -bitcode-mdindex-threshold=0 | llvm-bcanalyzer -dump | FileCheck %s
 ; Check that nodes are emitted in post-order to minimize the need for temporary
 ; nodes.  The graph structure is designed to foil naive implementations of
 ; iteratitive post-order traersals: the leaves, !3 and !4, are reachable from
@ -15,6 +15,9 @@
 ; CHECK-NEXT:    'leaf
 ; CHECK-NEXT:  }

+; Before the records we emit an offset to the index for the block
+; CHECK-NEXT:   <INDEX_OFFSET
+
 ; The leafs should come first (in either order).
 ; CHECK-NEXT:  <NODE op0=1/>
 ; CHECK-NEXT:  <NODE op0=2/>
@ -27,6 +30,10 @@
 ; CHECK-NEXT:  <NODE op0=3 op1=5 op2=4/>
 !6 = !{!3, !5, !4}

+; Before the named records we emit the index containing the position of the
+; previously emitted records
+; CHECK-NEXT:   <INDEX {{.*}} (offset match)
+
 ; Note: named metadata nodes are not cannot reference null so their operands
 ; are numbered off-by-one.
 ; CHECK-NEXT:  <NAME
--- a/test/Bitcode/metadata-function-blocks.ll
+++ b/test/Bitcode/metadata-function-blocks.ll
@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llvm-bcanalyzer -dump | FileCheck %s
+; RUN: llvm-as < %s -bitcode-mdindex-threshold=0 | llvm-bcanalyzer -dump | FileCheck %s
 ; Test that metadata only used by a single function is serialized in that
 ; function instead of in the global pool.
 ;
@ -19,6 +19,9 @@
 ; Each node gets a new number.  Bottom-up traversal of nodes.
 !named = !{!6}

+; Before the records we emit an offset to the index for the block
+; CHECK-NEXT:   <INDEX_OFFSET
+
 ; CHECK-NEXT:   <NODE op0=1/>
 !4 = !{!"named"}

--- a/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
+++ b/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
@ -353,6 +353,8 @@ static const char *GetCodeName(unsigned CodeID, unsigned BlockID,
      STRINGIFY_CODE(METADATA, OBJC_PROPERTY)
      STRINGIFY_CODE(METADATA, IMPORTED_ENTITY)
      STRINGIFY_CODE(METADATA, MODULE)
+      STRINGIFY_CODE(METADATA, INDEX_OFFSET)
+      STRINGIFY_CODE(METADATA, INDEX)
    }
  case bitc::METADATA_KIND_BLOCK_ID:
    switch (CodeID) {
@ -514,6 +516,9 @@ static bool ParseBlock(BitstreamCursor &Stream, BitstreamBlockInfo &BlockInfo,

  SmallVector<uint64_t, 64> Record;

+  // Keep the offset to the metadata index if seen.
+  uint64_t MetadataIndexOffset = 0;
+
  // Read all the records for this block.
  while (1) {
    if (Stream.AtEndOfStream())
@ -600,6 +605,22 @@ static bool ParseBlock(BitstreamCursor &Stream, BitstreamBlockInfo &BlockInfo,
      for (unsigned i = 0, e = Record.size(); i != e; ++i)
        outs() << " op" << i << "=" << (int64_t)Record[i];

+      // If we found a metadata index, let's verify that we had an offset before
+      // and validate its forward reference offset was correct!
+      if (BlockID == bitc::METADATA_BLOCK_ID) {
+        if (Code == bitc::METADATA_INDEX_OFFSET) {
+          MetadataIndexOffset = Stream.GetCurrentBitNo() + Record[0];
+        }
+        if (Code == bitc::METADATA_INDEX) {
+          outs() << " (offset ";
+          if (MetadataIndexOffset == RecordStartBit)
+            outs() << "match)";
+          else
+            outs() << "mismatch: " << MetadataIndexOffset << " vs "
+                   << RecordStartBit << ")";
+        }
+      }
+
      // If we found a module hash, let's verify that it matches!
      if (BlockID == bitc::MODULE_BLOCK_ID && Code == bitc::MODULE_CODE_HASH) {
        if (Record.size() != 5)