llvm-capstone/lld/MachO/ExportTrie.cpp
Jez Ng 7bbdbacd00 [lld-macho] Use export trie instead of symtab when linking against dylibs
Summary:
This allows us to link against stripped dylibs. Moreover, it's simply
more correct: The symbol table includes symbols that the dylib uses but
doesn't export.

This temporarily regresses our ability to do lazy symbol binding because
dyld_stub_binder isn't in libSystem's export trie. Rather, it is in one
of the sub-libraries libSystem re-exports. (This doesn't affect our
tests since we are mocking out dyld_stub_binder there.) A follow-up diff
will address this by adding support for sub-libraries.

Depends on D79114.

Reviewers: ruiu, pcc, MaskRay, smeenai, alexshap, gkm, Ktwu, christylee

Subscribers: mgorny, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D79226
2020-05-09 20:56:22 -07:00

290 lines
8.6 KiB
C++

//===- ExportTrie.cpp -----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is a partial implementation of the Mach-O export trie format. It's
// essentially a symbol table encoded as a compressed prefix trie, meaning that
// the common prefixes of each symbol name are shared for a more compact
// representation. The prefixes are stored on the edges of the trie, and one
// edge can represent multiple characters. For example, given two exported
// symbols _bar and _baz, we will have a trie like this (terminal nodes are
// marked with an asterisk):
//
// +-+-+
// | | // root node
// +-+-+
// |
// | _ba
// |
// +-+-+
// | |
// +-+-+
// r / \ z
// / \
// +-+-+ +-+-+
// | * | | * |
// +-+-+ +-+-+
//
// More documentation of the format can be found in
// llvm/tools/obj2yaml/macho2yaml.cpp.
//
//===----------------------------------------------------------------------===//
#include "ExportTrie.h"
#include "Symbols.h"
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Memory.h"
#include "llvm/ADT/Optional.h"
#include "llvm/BinaryFormat/MachO.h"
#include "llvm/Support/LEB128.h"
using namespace llvm;
using namespace llvm::MachO;
using namespace lld;
using namespace lld::macho;
namespace {
struct Edge {
Edge(StringRef s, TrieNode *node) : substring(s), child(node) {}
StringRef substring;
struct TrieNode *child;
};
struct ExportInfo {
uint64_t address;
// TODO: Add proper support for re-exports & stub-and-resolver flags.
};
} // namespace
namespace lld {
namespace macho {
struct TrieNode {
std::vector<Edge> edges;
Optional<ExportInfo> info;
// Estimated offset from the start of the serialized trie to the current node.
// This will converge to the true offset when updateOffset() is run to a
// fixpoint.
size_t offset = 0;
// Returns whether the new estimated offset differs from the old one.
bool updateOffset(size_t &nextOffset);
void writeTo(uint8_t *buf) const;
};
bool TrieNode::updateOffset(size_t &nextOffset) {
// Size of the whole node (including the terminalSize and the outgoing edges.)
// In contrast, terminalSize only records the size of the other data in the
// node.
size_t nodeSize;
if (info) {
uint64_t flags = 0;
uint32_t terminalSize =
getULEB128Size(flags) + getULEB128Size(info->address);
// Overall node size so far is the uleb128 size of the length of the symbol
// info + the symbol info itself.
nodeSize = terminalSize + getULEB128Size(terminalSize);
} else {
nodeSize = 1; // Size of terminalSize (which has a value of 0)
}
// Compute size of all child edges.
++nodeSize; // Byte for number of children.
for (Edge &edge : edges) {
nodeSize += edge.substring.size() + 1 // String length.
+ getULEB128Size(edge.child->offset); // Offset len.
}
// On input, 'nextOffset' is the new preferred location for this node.
bool result = (offset != nextOffset);
// Store new location in node object for use by parents.
offset = nextOffset;
nextOffset += nodeSize;
return result;
}
void TrieNode::writeTo(uint8_t *buf) const {
buf += offset;
if (info) {
// TrieNodes with Symbol info: size, flags address
uint64_t flags = 0; // TODO: emit proper flags
uint32_t terminalSize =
getULEB128Size(flags) + getULEB128Size(info->address);
buf += encodeULEB128(terminalSize, buf);
buf += encodeULEB128(flags, buf);
buf += encodeULEB128(info->address, buf);
} else {
// TrieNode with no Symbol info.
*buf++ = 0; // terminalSize
}
// Add number of children. TODO: Handle case where we have more than 256.
assert(edges.size() < 256);
*buf++ = edges.size();
// Append each child edge substring and node offset.
for (const Edge &edge : edges) {
memcpy(buf, edge.substring.data(), edge.substring.size());
buf += edge.substring.size();
*buf++ = '\0';
buf += encodeULEB128(edge.child->offset, buf);
}
}
TrieNode *TrieBuilder::makeNode() {
auto *node = make<TrieNode>();
nodes.emplace_back(node);
return node;
}
static int charAt(const Symbol *sym, size_t pos) {
StringRef str = sym->getName();
if (pos >= str.size())
return -1;
return str[pos];
}
// Build the trie by performing a three-way radix quicksort: We start by sorting
// the strings by their first characters, then sort the strings with the same
// first characters by their second characters, and so on recursively. Each
// time the prefixes diverge, we add a node to the trie.
//
// node: The most recently created node along this path in the trie (i.e.
// the furthest from the root.)
// lastPos: The prefix length of the most recently created node, i.e. the number
// of characters along its path from the root.
// pos: The string index we are currently sorting on. Note that each symbol
// S contained in vec has the same prefix S[0...pos).
void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec,
TrieNode *node, size_t lastPos, size_t pos) {
tailcall:
if (vec.empty())
return;
// Partition items so that items in [0, i) are less than the pivot,
// [i, j) are the same as the pivot, and [j, vec.size()) are greater than
// the pivot.
const Symbol *pivotSymbol = vec[vec.size() / 2];
int pivot = charAt(pivotSymbol, pos);
size_t i = 0;
size_t j = vec.size();
for (size_t k = 0; k < j;) {
int c = charAt(vec[k], pos);
if (c < pivot)
std::swap(vec[i++], vec[k++]);
else if (c > pivot)
std::swap(vec[--j], vec[k]);
else
k++;
}
bool isTerminal = pivot == -1;
bool prefixesDiverge = i != 0 || j != vec.size();
if (lastPos != pos && (isTerminal || prefixesDiverge)) {
TrieNode *newNode = makeNode();
node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos),
newNode);
node = newNode;
lastPos = pos;
}
sortAndBuild(vec.slice(0, i), node, lastPos, pos);
sortAndBuild(vec.slice(j), node, lastPos, pos);
if (isTerminal) {
assert(j - i == 1); // no duplicate symbols
node->info = {pivotSymbol->getVA()};
} else {
// This is the tail-call-optimized version of the following:
// sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1);
vec = vec.slice(i, j - i);
++pos;
goto tailcall;
}
}
size_t TrieBuilder::build() {
if (exported.empty())
return 0;
TrieNode *root = makeNode();
sortAndBuild(exported, root, 0, 0);
// Assign each node in the vector an offset in the trie stream, iterating
// until all uleb128 sizes have stabilized.
size_t offset;
bool more;
do {
offset = 0;
more = false;
for (TrieNode *node : nodes)
more |= node->updateOffset(offset);
} while (more);
return offset;
}
void TrieBuilder::writeTo(uint8_t *buf) const {
for (TrieNode *node : nodes)
node->writeTo(buf);
}
namespace {
// Parse a serialized trie and invoke a callback for each entry.
class TrieParser {
public:
TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback)
: start(buf), end(start + size), callback(callback) {}
void parse(const uint8_t *buf, const Twine &cumulativeString);
void parse() { parse(start, ""); }
const uint8_t *start;
const uint8_t *end;
const TrieEntryCallback &callback;
};
} // namespace
void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) {
if (buf >= end)
fatal("Node offset points outside export section");
unsigned ulebSize;
uint64_t terminalSize = decodeULEB128(buf, &ulebSize);
buf += ulebSize;
uint64_t flags = 0;
size_t offset;
if (terminalSize != 0) {
flags = decodeULEB128(buf, &ulebSize);
callback(cumulativeString, flags);
}
buf += terminalSize;
uint8_t numEdges = *buf++;
for (uint8_t i = 0; i < numEdges; ++i) {
const char *cbuf = reinterpret_cast<const char *>(buf);
StringRef substring = StringRef(cbuf, strnlen(cbuf, end - buf));
buf += substring.size() + 1;
offset = decodeULEB128(buf, &ulebSize);
buf += ulebSize;
parse(start + offset, cumulativeString + substring);
}
}
void parseTrie(const uint8_t *buf, size_t size,
const TrieEntryCallback &callback) {
if (size == 0)
return;
TrieParser(buf, size, callback).parse();
}
} // namespace macho
} // namespace lld