[BOLT] Efficient edge profiling in instrumented mode

Summary:
Change our edge profiling technique when using instrumentation so that
we no longer instrument every edge. Instead, build a spanning tree for
the CFG and omit instrumentation for edges in the spanning tree. Infer
the counts for these edges when writing the profile at run time. The
inference works with a bottom-up traversal of the spanning tree and
establishes the value of the edge connecting to the parent based on a
simple flow equation involving output and input edges, where the only
unknown variable is the parent edge.

This requires some engineering in the runtime lib to support dynamic
allocation for building these graphs at runtime.
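
For illustration (hypothetical counts): if a node's outgoing CFG edges
were counted as 5 and 3, and its only instrumented incoming edge as 2,
then the uninstrumented spanning-tree edge into that node must carry
5 + 3 - 2 = 6, since flow into a node equals flow out of it.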

(cherry picked from FBD17062773)
Rafael Auler authored on 2019-08-07 16:09:50 -07:00; committed by Maksim Panchenko
parent 52786928ff
commit cc4b2fb614
9 changed files with 1118 additions and 225 deletions


@ -8,5 +8,7 @@ project(libbolt_rt_project)
add_library(bolt_rt STATIC
instr.cpp
)
# Don't let the compiler think it can create calls to standard libs
target_compile_options(bolt_rt PRIVATE -ffreestanding)
install(TARGETS bolt_rt DESTINATION lib)


@ -17,16 +17,35 @@
#include <cstdint>
#include <elf.h>
//#define ENABLE_DEBUG
#ifdef ENABLE_DEBUG
#define DEBUG(X) \
{ X; }
#else
#define DEBUG(X) \
{}
#endif
// All extern declarations here need to be defined by BOLT itself
// Counters inserted by instrumentation, incremented during runtime when
// points of interest (locations) in the program are reached.
extern uint64_t __bolt_instr_locations[];
// Number of call descriptions
extern uint32_t __bolt_instr_num_calls;
// Number of function descriptions
extern uint32_t __bolt_instr_num_funcs;
// Filename to dump data to
extern char __bolt_instr_filename[];
// Anonymous namespace covering everything but our library entry point
namespace {
// We use stack-allocated buffers of this size for string manipulation in
// some places.
constexpr uint32_t BufSize = 10240;
// A location is a function name plus offset. Function name needs to be
// retrieved from the string table and is stored as an index to this table.
struct Location {
@ -34,30 +53,58 @@ struct Location {
uint32_t Offset;
};
struct CallDescription {
Location From;
Location To;
uint32_t Counter;
};
// An edge description defines an instrumented edge in the program, fully
// identified by where the jump is located and its destination.
struct EdgeDescription {
Location From;
uint32_t FromNode;
Location To;
uint32_t ToNode;
uint32_t Counter;
};
// This description is used for instrumented basic blocks. We only instrument
// blocks that are leaves of the spanning tree associated with the CFG. We use
// their execution count as a starting point to determine the frequency of the
// incoming edge in the spanning tree (the one that is not explicitly
// instrumented). All other edges (not in the spanning tree) have an associated
// counter themselves. We never write a node's counter to the profile; it is
// only used to infer the counts of other edges.
struct InstrumentedNode {
uint32_t Node;
uint32_t Counter;
};
// General metadata describing a function - number of CFG nodes, leaf nodes and
// edges.
struct FunctionDescription {
uint32_t NumNodes;
InstrumentedNode *LeafNodes;
uint32_t NumEdges;
EdgeDescription *Edges;
};
// These need to be read from disk. They are generated by BOLT and written to
// an ELF note section in the binary itself.
struct InstrumentationInfo {
CallDescription *CallDescriptions;
uint8_t *FuncDescriptions;
char *Strings; // String table with function names used in this binary
int FileDesc; // File descriptor for the file on disk backing this
// information in memory via mmap
void *MMapPtr; // The mmap ptr
int MMapSize; // The mmap size
};
// Declare some syscall wrappers we use throughout this code to avoid linking
// against system libc.
uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) {
uint64_t ret;
__asm__ __volatile__ (
"movq $2, %%rax\n"
@ -68,7 +115,7 @@ __open(const char *pathname,
return ret;
}
uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
uint64_t ret;
__asm__ __volatile__ (
"movq $1, %%rax\n"
@ -79,7 +126,7 @@ static uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
return ret;
}
uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
uint64_t ret;
__asm__ __volatile__ (
"movq $8, %%rax\n"
@ -90,7 +137,7 @@ static uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
return ret;
}
int __close(uint64_t fd) {
uint64_t ret;
__asm__ __volatile__ (
"movq $3, %%rax\n"
@ -101,8 +148,8 @@ static int __close(uint64_t fd) {
return ret;
}
void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags,
uint64_t fd, uint64_t offset) {
void *ret;
register uint64_t r8 asm("r8") = fd;
register uint64_t r9 asm("r9") = offset;
@ -116,7 +163,7 @@ static void *__mmap(uint64_t addr, uint64_t size, uint64_t prot,
return ret;
}
uint64_t __munmap(void *addr, uint64_t size) {
uint64_t ret;
__asm__ __volatile__ (
"movq $11, %%rax\n"
@ -127,7 +174,7 @@ static uint64_t __munmap(void *addr, uint64_t size) {
return ret;
}
uint64_t __exit(uint64_t code) {
uint64_t ret;
__asm__ __volatile__ (
"movq $231, %%rax\n"
@ -142,9 +189,9 @@ static uint64_t __exit(uint64_t code) {
// Write number Num using Base to the buffer in OutBuf, returns a pointer to
// the end of the string.
char *intToStr(char *OutBuf, uint64_t Num, uint32_t Base) {
const char *Chars = "0123456789abcdef";
char Buf[21];
char *Ptr = Buf;
while (Num) {
*Ptr++ = *(Chars + (Num % Base));
@ -160,22 +207,147 @@ static char *intToStr(char *OutBuf, uint32_t Num, uint32_t Base) {
return OutBuf;
}
// Copy Str to OutBuf, returns a pointer to the end of the copied string
char *strCopy(char *OutBuf, const char *Str, int32_t Size = BufSize) {
while (*Str) {
*OutBuf++ = *Str++;
if (--Size <= 0)
return OutBuf;
}
return OutBuf;
}
void memSet(char *Buf, char C, uint32_t Size) {
for (int I = 0; I < Size; ++I)
*Buf++ = C;
}
uint32_t strLen(const char *Str) {
uint32_t Size = 0;
while (*Str++)
++Size;
return Size;
}
// Prints Msg to STDERR and quits with error code 1
void reportError(const char *Msg, uint64_t Size) {
__write(2, Msg, Size);
__exit(1);
}
void assert(bool Assertion, const char *Msg) {
if (Assertion)
return;
char Buf[BufSize];
char *Ptr = Buf;
Ptr = strCopy(Ptr, "Assertion failed: ");
Ptr = strCopy(Ptr, Msg, BufSize - 40);
Ptr = strCopy(Ptr, "\n");
reportError(Buf, Ptr - Buf);
}
void reportNumber(const char *Msg, uint64_t Num, uint32_t Base) {
char Buf[BufSize];
char *Ptr = Buf;
Ptr = strCopy(Ptr, Msg, BufSize - 23);
Ptr = intToStr(Ptr, Num, Base);
Ptr = strCopy(Ptr, "\n");
__write(2, Buf, Ptr - Buf);
}
void report(const char *Msg) {
__write(2, Msg, strLen(Msg));
}
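// Rounds Value up to the next multiple of Align; for example,
// alignTo(13, 16) == 16 and alignTo(32, 16) == 32.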
inline uint64_t alignTo(uint64_t Value, uint64_t Align) {
return (Value + Align - 1) / Align * Align;
}
/// A simple allocator that mmaps a fixed size region and manages this space
/// in a stack fashion, meaning you always deallocate the last element that
/// was allocated.
class BumpPtrAllocator {
struct EntryMetadata {
uint64_t Magic;
uint64_t AllocSize;
};
public:
void *allocate(uintptr_t Size) {
if (StackBase == nullptr) {
StackBase = reinterpret_cast<uint8_t *>(__mmap(
0, MAXSIZE, 0x3 /* PROT_READ | PROT_WRITE*/,
0x22 /* MAP_PRIVATE | MAP_ANONYMOUS*/, -1, 0));
StackSize = 0;
}
Size = alignTo(Size + sizeof(EntryMetadata), 16);
uint8_t * AllocAddress = StackBase + StackSize + sizeof(EntryMetadata);
auto *M = reinterpret_cast<EntryMetadata *>(StackBase + StackSize);
M->Magic = MAGIC;
M->AllocSize = Size;
StackSize += Size;
assert(StackSize < MAXSIZE, "allocator ran out of memory");
return AllocAddress;
}
void deallocate(void *Ptr) {
uint8_t MetadataOffset = sizeof(EntryMetadata);
auto *M = reinterpret_cast<EntryMetadata *>(
reinterpret_cast<uint8_t *>(Ptr) - MetadataOffset);
const uint8_t *StackTop = StackBase + StackSize + MetadataOffset;
// Validate size
if (Ptr != StackTop - M->AllocSize) {
// This could be a pointer returned by operator new []
MetadataOffset +=
sizeof(uint64_t); // Space for number of elements alloc'ed
M = reinterpret_cast<EntryMetadata *>(reinterpret_cast<uint8_t *>(Ptr) -
MetadataOffset);
assert(Ptr == StackTop - M->AllocSize,
"must deallocate the last element alloc'ed");
}
assert(M->Magic == MAGIC, "allocator magic is corrupt");
StackSize -= M->AllocSize;
}
private:
static constexpr uint64_t MAGIC = 0x1122334455667788ull;
static constexpr uint64_t MAXSIZE = 0xa00000;
uint8_t *StackBase{nullptr};
uint64_t StackSize{0};
uint8_t *LastAlloc{nullptr};
};
} // anonymous namespace
// User-defined placement new operators. We only use those (as opposed to
// overriding the regular operator new) so we can keep our allocator in the
// stack instead of in a data section (global).
void *operator new(uintptr_t Sz, BumpPtrAllocator &A) {
return A.allocate(Sz);
}
void *operator new (uintptr_t Sz, BumpPtrAllocator &A, char C) {
auto *Ptr = reinterpret_cast<char *>(A.allocate(Sz));
memSet(Ptr, C, Sz);
return Ptr;
}
void *operator new [] (uintptr_t Sz, BumpPtrAllocator &A) {
return A.allocate(Sz);
}
void *operator new [] (uintptr_t Sz, BumpPtrAllocator &A, char C) {
auto *Ptr = reinterpret_cast<char *>(A.allocate(Sz));
memSet(Ptr, C, Sz);
return Ptr;
}
// Only called during exception unwinding (useless). We must manually dealloc.
// C++ language weirdness
void operator delete(void *Ptr, BumpPtrAllocator &A) {
A.deallocate(Ptr);
}
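// Illustrative usage of the allocator and the placement operators above (a
// sketch, not part of the original code). Because the allocator is a stack,
// objects must be deallocated in the reverse order of their allocation:
//
//   BumpPtrAllocator Alloc;
//   Edge *Edges = new (Alloc) Edge[8];             // uninitialized array
//   uint64_t *Freqs = new (Alloc, 0) uint64_t[8];  // zero-filled variant
//   Alloc.deallocate(Freqs);                       // most recent first
//   Alloc.deallocate(Edges);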
namespace {
// Performs a string comparison and returns zero if Str1 matches Str2. Compares
// at most Size characters.
int compareStr(const char *Str1, const char *Str2, int Size) {
while (*Str1 == *Str2) {
if (*Str1 == '\0' || --Size == 0)
return 0;
@ -187,15 +359,19 @@ static int compareStr(const char *Str1, const char *Str2, int Size) {
// Write as a string in OutBuf an identifier for the program point at function
// whose name is in the string table index FuncStrIndex plus Offset.
char *serializeLoc(const InstrumentationInfo &Info, char *OutBuf,
const Location Loc, uint32_t BufSize) {
// fdata location format: Type Name Offset
// Type 1 - regular symbol
OutBuf = strCopy(OutBuf, "1 ");
const char *Str = Info.Strings + Loc.FunctionName;
uint32_t Size = 25;
while (*Str) {
*OutBuf++ = *Str++;
if (++Size >= BufSize)
break;
}
assert(!*Str, "buffer overflow, function name too large");
*OutBuf++ = ' ';
OutBuf = intToStr(OutBuf, Loc.Offset, 16);
*OutBuf++ = ' ';
@ -204,11 +380,12 @@ static char *serializeLoc(const InstrumentationInfo &Info, char *OutBuf,
// Read and map to memory the descriptions written by BOLT into the executable's
// notes section
InstrumentationInfo readDescriptions() {
InstrumentationInfo Result;
uint64_t FD = __open("/proc/self/exe",
/*flags=*/0 /*O_RDONLY*/,
/*mode=*/0666);
assert(static_cast<int64_t>(FD) > 0, "Failed to open /proc/self/exe");
Result.FileDesc = FD;
// mmap our binary to memory
@ -232,18 +409,20 @@ static InstrumentationInfo readDescriptions() {
continue;
}
// Actual contents of the ELF note start after offset 20 decimal:
// Offset 0: Producer name size (4 bytes)
// Offset 4: Contents size (4 bytes)
// Offset 8: Note type (4 bytes)
// Offset 12: Producer name (BOLT\0) (5 bytes + align to 4-byte boundary)
// Offset 20: Contents
uint32_t CallDescSize =
*reinterpret_cast<uint32_t *>(BinContents + Shdr->sh_offset + 20);
uint32_t FuncDescSize = *reinterpret_cast<uint32_t *>(
BinContents + Shdr->sh_offset + 24 + CallDescSize);
Result.CallDescriptions =
reinterpret_cast<CallDescription *>(BinContents + Shdr->sh_offset + 24);
Result.FuncDescriptions = BinContents + Shdr->sh_offset + 28 + CallDescSize;
Result.Strings = reinterpret_cast<char *>(BinContents + Shdr->sh_offset +
28 + CallDescSize + FuncDescSize);
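// Summary of the note layout parsed above (offsets from Shdr->sh_offset):
//   +20: uint32_t CallDescSize
//   +24: CallDescription table (CallDescSize bytes)
//   +24 + CallDescSize: uint32_t FuncDescSize
//   +28 + CallDescSize: serialized function descriptions (FuncDescSize bytes)
//   +28 + CallDescSize + FuncDescSize: string table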
return Result;
}
const char ErrMsg[] =
@ -253,6 +432,419 @@ static InstrumentationInfo readDescriptions() {
return Result;
}
void printStats(const InstrumentationInfo &Info) {
char StatMsg[BufSize];
char *StatPtr = StatMsg;
StatPtr = strCopy(
StatPtr, "\nBOLT INSTRUMENTATION RUNTIME STATISTICS\n\nCallDescSize: ");
StatPtr = intToStr(StatPtr,
Info.FuncDescriptions -
reinterpret_cast<uint8_t *>(Info.CallDescriptions),
10);
StatPtr = strCopy(StatPtr, "\nFuncDescSize: ");
StatPtr = intToStr(
StatPtr,
reinterpret_cast<uint8_t *>(Info.Strings) - Info.FuncDescriptions, 10);
StatPtr = strCopy(StatPtr, "\n__bolt_instr_num_calls: ");
StatPtr = intToStr(StatPtr, __bolt_instr_num_calls, 10);
StatPtr = strCopy(StatPtr, "\n__bolt_instr_num_funcs: ");
StatPtr = intToStr(StatPtr, __bolt_instr_num_funcs, 10);
StatPtr = strCopy(StatPtr, "\n");
__write(2, StatMsg, StatPtr - StatMsg);
}
/// This is part of a simple CFG representation in memory, where we store
/// a dynamically sized array of input and output edges per node, and store
/// a dynamically sized array of nodes per graph. We also store the spanning
/// tree edges for that CFG in a separate array of nodes in
/// \p SpanningTreeNodes, while the regular nodes live in \p CFGNodes.
struct Edge {
uint32_t Node; // Index in the nodes array of this edge's destination
uint32_t ID; // Edge index in an array comprising all edges of the graph
};
/// A regular graph node or a spanning tree node
struct Node {
uint32_t NumInEdges{0}; // Input edge count used to size InEdge
uint32_t NumOutEdges{0}; // Output edge count used to size OutEdges
Edge *InEdges{nullptr}; // Created and managed by \p Graph
Edge *OutEdges{nullptr}; // ditto
};
/// Main class for CFG representation in memory. Manages object creation and
/// destruction, populates an array of CFG nodes as well as corresponding
/// spanning tree nodes.
struct Graph {
uint32_t NumNodes;
Node *CFGNodes;
Node *SpanningTreeNodes;
BumpPtrAllocator &Alloc;
/// Reads a list of \p NumEdgeDescs descriptions in \p EdgeDescs and builds
/// the graph from it. Allocates several internal dynamic structures that are
/// later destroyed by ~Graph() and uses \p Alloc. \p LeafNodes contains all
/// spanning-tree leaf node descriptions (their counters). They are the seed
/// used to compute the rest of the missing edge counts in a bottom-up
/// traversal of the spanning tree.
Graph(BumpPtrAllocator &Alloc, const EdgeDescription *EdgeDescs,
uint32_t NumEdgeDescs, const InstrumentedNode *LeafNodes,
uint32_t NumLeafNodes);
~Graph();
void dump() const;
};
Graph::Graph(BumpPtrAllocator &Alloc, const EdgeDescription *EdgeDescs,
uint32_t NumEdgeDescs, const InstrumentedNode *LeafNodes,
uint32_t NumLeafNodes) : Alloc(Alloc) {
DEBUG(reportNumber("G = 0x", (uint64_t)this, 16));
// First pass to determine number of nodes
uint32_t MaxNodes = 0;
for (int I = 0; I < NumEdgeDescs; ++I) {
if (EdgeDescs[I].FromNode > MaxNodes)
MaxNodes = EdgeDescs[I].FromNode;
if (EdgeDescs[I].ToNode > MaxNodes)
MaxNodes = EdgeDescs[I].ToNode;
}
for (int I = 0; I < NumLeafNodes; ++I) {
if (LeafNodes[I].Node > MaxNodes)
MaxNodes = LeafNodes[I].Node;
}
// No edges? Nothing to do
if (!MaxNodes) {
CFGNodes = nullptr;
SpanningTreeNodes = nullptr;
NumNodes = 0;
return;
}
++MaxNodes;
DEBUG(reportNumber("NumNodes = ", MaxNodes, 10));
NumNodes = MaxNodes;
// Initial allocations
CFGNodes = new (Alloc) Node[MaxNodes];
DEBUG(reportNumber("G->CFGNodes = 0x", (uint64_t)CFGNodes, 16));
SpanningTreeNodes = new (Alloc) Node[MaxNodes];
DEBUG(reportNumber("G->SpanningTreeNodes = 0x",
(uint64_t)SpanningTreeNodes, 16));
// Figure out how much to allocate to each vector (in/out edge sets)
for (int I = 0; I < NumEdgeDescs; ++I) {
CFGNodes[EdgeDescs[I].FromNode].NumOutEdges++;
CFGNodes[EdgeDescs[I].ToNode].NumInEdges++;
if (EdgeDescs[I].Counter != 0xffffffff)
continue;
SpanningTreeNodes[EdgeDescs[I].FromNode].NumOutEdges++;
SpanningTreeNodes[EdgeDescs[I].ToNode].NumInEdges++;
}
// Allocate in/out edge sets
for (int I = 0; I < MaxNodes; ++I) {
if (CFGNodes[I].NumInEdges > 0)
CFGNodes[I].InEdges = new (Alloc) Edge[CFGNodes[I].NumInEdges];
if (CFGNodes[I].NumOutEdges > 0)
CFGNodes[I].OutEdges = new (Alloc) Edge[CFGNodes[I].NumOutEdges];
if (SpanningTreeNodes[I].NumInEdges > 0)
SpanningTreeNodes[I].InEdges =
new (Alloc) Edge[SpanningTreeNodes[I].NumInEdges];
if (SpanningTreeNodes[I].NumOutEdges > 0)
SpanningTreeNodes[I].OutEdges =
new (Alloc) Edge[SpanningTreeNodes[I].NumOutEdges];
CFGNodes[I].NumInEdges = 0;
CFGNodes[I].NumOutEdges = 0;
SpanningTreeNodes[I].NumInEdges = 0;
SpanningTreeNodes[I].NumOutEdges = 0;
}
// Fill in/out edge sets
for (int I = 0; I < NumEdgeDescs; ++I) {
const uint32_t Src = EdgeDescs[I].FromNode;
const uint32_t Dst = EdgeDescs[I].ToNode;
Edge *E = &CFGNodes[Src].OutEdges[CFGNodes[Src].NumOutEdges++];
E->Node = Dst;
E->ID = I;
E = &CFGNodes[Dst].InEdges[CFGNodes[Dst].NumInEdges++];
E->Node = Src;
E->ID = I;
if (EdgeDescs[I].Counter != 0xffffffff)
continue;
E = &SpanningTreeNodes[Src]
.OutEdges[SpanningTreeNodes[Src].NumOutEdges++];
E->Node = Dst;
E->ID = I;
E = &SpanningTreeNodes[Dst]
.InEdges[SpanningTreeNodes[Dst].NumInEdges++];
E->Node = Src;
E->ID = I;
}
}
Graph::~Graph() {
for (int I = NumNodes - 1; I >= 0; --I) {
if (SpanningTreeNodes[I].OutEdges)
Alloc.deallocate(SpanningTreeNodes[I].OutEdges);
if (SpanningTreeNodes[I].InEdges)
Alloc.deallocate(SpanningTreeNodes[I].InEdges);
if (CFGNodes[I].OutEdges)
Alloc.deallocate(CFGNodes[I].OutEdges);
if (CFGNodes[I].InEdges)
Alloc.deallocate(CFGNodes[I].InEdges);
}
if (SpanningTreeNodes)
Alloc.deallocate(SpanningTreeNodes);
if (CFGNodes)
Alloc.deallocate(CFGNodes);
}
void Graph::dump() const {
reportNumber("Dumping graph with number of nodes: ", NumNodes, 10);
report(" Full graph:\n");
for (int I = 0; I < NumNodes; ++I) {
const Node *N = &CFGNodes[I];
reportNumber(" Node #", I, 10);
reportNumber(" InEdges total ", N->NumInEdges, 10);
for (int J = 0; J < N->NumInEdges; ++J)
reportNumber(" ", N->InEdges[J].Node, 10);
reportNumber(" OutEdges total ", N->NumOutEdges, 10);
for (int J = 0; J < N->NumOutEdges; ++J)
reportNumber(" ", N->OutEdges[J].Node, 10);
report("\n");
}
report(" Spanning tree:\n");
for (int I = 0; I < NumNodes; ++I) {
const Node *N = &SpanningTreeNodes[I];
reportNumber(" Node #", I, 10);
reportNumber(" InEdges total ", N->NumInEdges, 10);
for (int J = 0; J < N->NumInEdges; ++J)
reportNumber(" ", N->InEdges[J].Node, 10);
reportNumber(" OutEdges total ", N->NumOutEdges, 10);
for (int J = 0; J < N->NumOutEdges; ++J)
reportNumber(" ", N->OutEdges[J].Node, 10);
report("\n");
}
}
void dumpEdgeFreqs(const uint64_t *EdgeFreqs, const EdgeDescription *EdgeDescs,
uint32_t NumEdges) {
reportNumber("Dumping edge frequencies for graph with num edges: ", NumEdges,
10);
for (int I = 0; I < NumEdges; ++I) {
reportNumber("* Src: ", EdgeDescs[I].FromNode, 10);
reportNumber(" Dst: ", EdgeDescs[I].ToNode, 10);
reportNumber(" Cnt: ", EdgeFreqs[I], 10);
}
}
// Return an array with the frequency of each edge in the function represented
// by G.
uint64_t *computeEdgeFrequencies(BumpPtrAllocator &Alloc, Graph *G,
const EdgeDescription *EdgeDescs,
uint32_t NumEdges,
const InstrumentedNode *LeafNodes,
uint32_t NumLeafNodes,
const uint64_t *Counters) {
if (G->NumNodes == 0 || NumEdges == 0)
return 0;
assert(NumLeafNodes > 0, "no leaf node frequency");
uint64_t *EdgeFrequency = new (Alloc, 0) uint64_t [NumEdges];
// Perform a bottom-up, BFS traversal of the spanning tree in G. Edges in the
// spanning tree don't have explicit counters. We must infer their value using
// a linear combination of other counters (sum of counters of the outgoing
// edges minus sum of counters of the incoming edges).
uint32_t *Stack = new (Alloc) uint32_t [G->NumNodes];
uint32_t StackTop = 0;
enum Status : uint8_t { S_NEW = 0, S_VISITING, S_VISITED };
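// These three states implement an iterative post-order traversal: popping a
// S_NEW node re-pushes it as S_VISITING together with its children, so by the
// time it is popped again all of its children have already been processed.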
Status *Visited = new (Alloc, 0) Status[G->NumNodes];
uint64_t *LeafFrequency = new (Alloc, 0) uint64_t[G->NumNodes];
// Set up a fast lookup for the frequency of leaf nodes, which have special
// basic block frequency instrumentation (they are not edge profiled).
uint64_t TotalFreq = 0;
for (int I = 0; I < NumLeafNodes; ++I) {
LeafFrequency[LeafNodes[I].Node] = Counters[LeafNodes[I].Counter];
DEBUG({
if (Counters[LeafNodes[I].Counter] > 0) {
reportNumber("Leaf Node# ", LeafNodes[I].Node, 10);
reportNumber(" Counter: ", Counters[LeafNodes[I].Counter], 10);
}
});
TotalFreq += Counters[LeafNodes[I].Counter];
}
// Add all root nodes to the stack
for (int I = 0; I < G->NumNodes; ++I) {
if (G->SpanningTreeNodes[I].NumInEdges == 0)
Stack[StackTop++] = I;
}
// Empty stack?
if (StackTop == 0) {
Alloc.deallocate(LeafFrequency);
Alloc.deallocate(Visited);
Alloc.deallocate(Stack);
Alloc.deallocate(EdgeFrequency);
return 0;
}
// Add all known edge counts, will infer the rest
for (int I = 0; I < NumEdges; ++I) {
const uint32_t C = EdgeDescs[I].Counter;
if (C == 0xffffffff) // inferred counter - we will compute its value
continue;
EdgeFrequency[I] = Counters[C];
TotalFreq += Counters[C];
}
// This function is completely cold; there is no point in computing anything
// since inferred edges will be zero too.
#ifndef ENABLE_DEBUG
if (TotalFreq == 0) {
Alloc.deallocate(LeafFrequency);
Alloc.deallocate(Visited);
Alloc.deallocate(Stack);
return EdgeFrequency;
}
#endif
while (StackTop > 0) {
const uint32_t Cur = Stack[--StackTop];
DEBUG({
if (Visited[Cur] == S_VISITING)
report("(visiting) ");
else
report("(new) ");
reportNumber("Cur: ", Cur, 10);
});
// This shouldn't happen in a tree
assert(Visited[Cur] != S_VISITED, "should not have visited nodes in stack");
if (Visited[Cur] == S_NEW) {
Visited[Cur] = S_VISITING;
Stack[StackTop++] = Cur;
assert(StackTop <= G->NumNodes, "stack grew too large");
for (int I = 0, E = G->SpanningTreeNodes[Cur].NumOutEdges; I < E; ++I) {
const uint32_t Succ = G->SpanningTreeNodes[Cur].OutEdges[I].Node;
Stack[StackTop++] = Succ;
assert(StackTop <= G->NumNodes, "stack grew too large");
}
continue;
}
Visited[Cur] = S_VISITED;
// No parent? Reached a tree root, nothing to do.
if (G->SpanningTreeNodes[Cur].NumInEdges == 0)
continue;
assert(G->SpanningTreeNodes[Cur].NumInEdges == 1, "must have 1 parent");
const uint32_t Parent = G->SpanningTreeNodes[Cur].InEdges[0].Node;
const uint32_t ParentEdge = G->SpanningTreeNodes[Cur].InEdges[0].ID;
// Establish our node frequency based on outgoing edges, which should all be
// resolved by now.
int64_t CurNodeFreq = LeafFrequency[Cur];
// Not a leaf?
if (!CurNodeFreq) {
for (int I = 0, E = G->CFGNodes[Cur].NumOutEdges; I != E; ++I) {
const uint32_t SuccEdge = G->CFGNodes[Cur].OutEdges[I].ID;
CurNodeFreq += EdgeFrequency[SuccEdge];
}
}
int64_t ParentEdgeFreq = CurNodeFreq;
// Calculate parent edge freq.
for (int I = 0, E = G->CFGNodes[Cur].NumInEdges; I != E; ++I) {
const uint32_t PredEdge = G->CFGNodes[Cur].InEdges[I].ID;
ParentEdgeFreq -= EdgeFrequency[PredEdge];
}
// Sometimes the conservative CFG that BOLT builds will lead to incorrect
// flow computation. For example, in a BB that transitively calls the exit
// syscall, BOLT will add a fall-through successor even though it should not
// have any successors. So this block's execution count will likely be wrong. We
// tolerate this imperfection since this case should be quite infrequent.
if (ParentEdgeFreq < 0) {
DEBUG(dumpEdgeFreqs(EdgeFrequency, EdgeDescs, NumEdges));
DEBUG(report("WARNING: incorrect flow"));
ParentEdgeFreq = 0;
}
DEBUG(reportNumber(" Setting freq for ParentEdge: ", ParentEdge, 10));
DEBUG(reportNumber(" with ParentEdgeFreq: ", ParentEdgeFreq, 10));
EdgeFrequency[ParentEdge] = ParentEdgeFreq;
}
Alloc.deallocate(LeafFrequency);
Alloc.deallocate(Visited);
Alloc.deallocate(Stack);
return EdgeFrequency;
}
// Write to \p FD all of the edge profiles for function \p FuncDesc. Uses
// \p Alloc to allocate helper dynamic structures used to compute profile for
// edges that we do not explicitly instrument.
uint8_t *writeFunctionProfile(int FD, const InstrumentationInfo &Info,
uint8_t *FuncDesc, BumpPtrAllocator &Alloc) {
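// Layout of the serialized function description parsed below; it mirrors
// what Instrumentation::emitTablesAsELFNote writes for each function:
//   uint32_t NumLeafNodes;
//   InstrumentedNode LeafNodes[NumLeafNodes];
//   uint32_t NumEdges;
//   EdgeDescription Edges[NumEdges];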
uint32_t NumLeafNodes = *reinterpret_cast<uint32_t *>(FuncDesc);
DEBUG(reportNumber("NumLeafNodes = ", NumLeafNodes, 10));
InstrumentedNode *LeafNodes =
reinterpret_cast<InstrumentedNode *>(FuncDesc + 4);
uint32_t NumEdges = *reinterpret_cast<uint32_t *>(
FuncDesc + 4 + NumLeafNodes * sizeof(InstrumentedNode));
DEBUG(reportNumber("NumEdges = ", NumEdges, 10));
EdgeDescription *EdgeDescs = reinterpret_cast<EdgeDescription *>(
FuncDesc + 8 + NumLeafNodes * sizeof(InstrumentedNode));
uint8_t *next = (FuncDesc + 8 + NumLeafNodes * sizeof(InstrumentedNode) +
NumEdges * sizeof(EdgeDescription));
// Skip funcs we know are cold
#ifndef ENABLE_DEBUG
uint64_t LeafFreq = 0;
for (int I = 0; I < NumLeafNodes; ++I) {
LeafFreq += __bolt_instr_locations[LeafNodes[I].Counter];
}
if (LeafFreq == 0)
return next;
#endif
Graph *G =
new (Alloc) Graph(Alloc, EdgeDescs, NumEdges, LeafNodes, NumLeafNodes);
DEBUG(G->dump());
uint64_t *Freqs =
computeEdgeFrequencies(Alloc, G, EdgeDescs, NumEdges, LeafNodes,
NumLeafNodes, __bolt_instr_locations);
if (!Freqs) {
G->~Graph();
Alloc.deallocate(G);
return next;
}
DEBUG(dumpEdgeFreqs(Freqs, EdgeDescs, NumEdges));
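// Each emitted line is an fdata branch entry: two serialized locations
// followed by a misprediction count (always 0 here, as instrumentation does
// not record it) and the execution count. A line might look like
// (hypothetical symbols and offsets): 1 foo a 1 foo 10 0 42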
for (int I = 0; I < NumEdges; ++I) {
const uint64_t Freq = Freqs[I];
if (Freq == 0)
continue;
const EdgeDescription *Desc = &EdgeDescs[I];
char LineBuf[BufSize];
char *Ptr = LineBuf;
Ptr = serializeLoc(Info, Ptr, Desc->From, BufSize);
Ptr = serializeLoc(Info, Ptr, Desc->To, BufSize - (Ptr - LineBuf));
Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 22);
Ptr = intToStr(Ptr, Freq, 10);
*Ptr++ = '\n';
__write(FD, LineBuf, Ptr - LineBuf);
}
Alloc.deallocate(Freqs);
G->~Graph();
Alloc.deallocate(G);
return next;
}
} // anonymous namespace
// This is the entry point called at program exit. BOLT patches the executable's
// FINI entry in the .dynamic section with the address of this function. Our
// goal here is to flush to disk all instrumentation data in memory, using
@ -260,25 +852,43 @@ static InstrumentationInfo readDescriptions() {
extern "C" void __bolt_instr_data_dump() {
const InstrumentationInfo Info = readDescriptions();
DEBUG(printStats(Info));
uint64_t FD = __open(__bolt_instr_filename,
/*flags=*/0x241 /*O_WRONLY|O_TRUNC|O_CREAT*/,
/*mode=*/0666);
if (static_cast<int64_t>(FD) < 0) {
reportNumber("Assertion error: failed to open profile file for writing. "
"Error number: 0x",
0 - static_cast<int64_t>(FD), 16);
__exit(1);
}
for (int I = 0, E = __bolt_instr_num_calls; I < E; ++I) {
char LineBuf[BufSize];
char *Ptr = LineBuf;
CallDescription *Desc = &Info.CallDescriptions[I];
uint64_t HitCount = __bolt_instr_locations[Desc->Counter];
if (!HitCount)
continue;
Ptr = serializeLoc(Info, Ptr, Desc->From, BufSize);
Ptr = serializeLoc(Info, Ptr, Desc->To, BufSize - (Ptr - LineBuf));
Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 25);
Ptr = intToStr(Ptr, HitCount, 10);
*Ptr++ = '\n';
__write(FD, LineBuf, Ptr - LineBuf);
}
BumpPtrAllocator Alloc;
uint8_t *FuncDesc = Info.FuncDescriptions;
for (int I = 0, E = __bolt_instr_num_funcs; I < E; ++I) {
FuncDesc = writeFunctionProfile(FD, Info, FuncDesc, Alloc);
DEBUG(reportNumber("FuncDesc now: ", (uint64_t)FuncDesc, 16));
}
assert(FuncDesc == (void *)Info.Strings,
"FuncDesc ptr must be equal to stringtable");
__close(FD);
__munmap(Info.MMapPtr, Info.MMapSize);
__close(Info.FileDesc);


@ -545,6 +545,7 @@ BinaryContext::getOrCreateJumpTable(BinaryFunction &Function, uint64_t Address,
std::pair<uint64_t, const MCSymbol *>
BinaryContext::duplicateJumpTable(BinaryFunction &Function, JumpTable *JT,
const MCSymbol *OldLabel) {
auto L = scopeLock();
unsigned Offset = 0;
bool Found = false;
for (auto Elmt : JT->Labels) {


@ -390,6 +390,9 @@ public:
/// A mutex that is used to control parallel accesses to Ctx
mutable std::shared_timed_mutex CtxMutex;
std::unique_lock<std::shared_timed_mutex> scopeLock() const {
return std::unique_lock<std::shared_timed_mutex>(CtxMutex);
}
std::unique_ptr<DWARFContext> DwCtx;


@ -898,7 +898,7 @@ MCSymbol *BinaryFunction::getOrCreateLocalLabel(uint64_t Address,
}
MCSymbol *Result;
{
auto L = BC.scopeLock();
Result = BC.Ctx->createTempSymbol();
}
Labels[Offset] = Result;
@ -1767,7 +1767,7 @@ bool BinaryFunction::buildCFG(MCPlusBuilder::AllocatorIdTy AllocatorId) {
} else {
MCSymbol *Label;
{
auto L = BC.scopeLock();
Label = BC.Ctx->createTempSymbol("FT", true);
}
InsertBB = addBasicBlock(
@ -3304,12 +3304,12 @@ void BinaryFunction::fixBranches() {
if (NextBB && NextBB == TSuccessor) {
std::swap(TSuccessor, FSuccessor);
{
auto L = BC.scopeLock();
MIB->reverseBranchCondition(*CondBranch, TSuccessor->getLabel(), Ctx);
}
BB->swapConditionalSuccessors();
} else {
auto L = BC.scopeLock();
MIB->replaceBranchTarget(*CondBranch, TSuccessor->getLabel(), Ctx);
}
if (TSuccessor == FSuccessor) {
@ -3324,7 +3324,7 @@ void BinaryFunction::fixBranches() {
BB->isCold() != TSuccessor->isCold()) {
std::swap(TSuccessor, FSuccessor);
{
auto L = BC.scopeLock();
MIB->reverseBranchCondition(*CondBranch, TSuccessor->getLabel(),
Ctx);
}
@ -3675,7 +3675,8 @@ bool BinaryFunction::checkForAmbiguousJumpTables() {
return false;
}
void BinaryFunction::disambiguateJumpTables(
MCPlusBuilder::AllocatorIdTy AllocId) {
assert((opts::JumpTables != JTS_BASIC && isSimple()) || BC.HasRelocations);
SmallPtrSet<JumpTable *, 4> JumpTables;
for (auto &BB : BasicBlocks) {
@ -3744,10 +3745,13 @@ void BinaryFunction::disambiguateJumpTables() {
const MCSymbol *NewJTLabel;
std::tie(NewJumpTableID, NewJTLabel) =
BC.duplicateJumpTable(*this, JT, Target);
{
auto L = BC.scopeLock();
BC.MIB->replaceMemOperandDisp(*JTLoadInst, NewJTLabel, BC.Ctx.get());
}
// We use a unique ID with the high bit set as address for this "injected"
// jump table (not originally in the input binary).
BC.MIB->setJumpTable(Inst, NewJumpTableID, 0, AllocId);
}
}
}
@ -3773,7 +3777,7 @@ BinaryBasicBlock *BinaryFunction::splitEdge(BinaryBasicBlock *From,
// Create intermediate BB
MCSymbol *Tmp;
{
auto L = BC.scopeLock();
Tmp = BC.Ctx->createTempSymbol("SplitEdge", true);
}
// Link new BBs to the original input offset of the From BB, so we can map


@ -1487,7 +1487,7 @@ public:
/// by an indirect branch, e.g.: instrumentation or shrink wrapping. However,
/// this is only possible if we are not updating jump tables in place, but are
/// writing it to a new location (moving them).
void disambiguateJumpTables(MCPlusBuilder::AllocatorIdTy AllocId);
/// Change \p OrigDest to \p NewDest in the jump table used at the end of
/// \p BB. Returns false if \p OrigDest couldn't be found as a valid target


@ -10,6 +10,7 @@
//===----------------------------------------------------------------------===//
#include "Instrumentation.h"
#include "ParallelUtilities.h"
#include "Passes/DataflowInfoManager.h"
#include "llvm/Support/Options.h"
@ -35,6 +36,13 @@ cl::opt<bool> InstrumentHotOnly(
cl::init(false),
cl::Optional,
cl::cat(BoltCategory));
cl::opt<bool> InstrumentCalls(
"instrument-calls",
cl::desc("record profile for inter-function control flow activity"),
cl::init(false),
cl::Optional,
cl::cat(BoltCategory));
}
namespace llvm {
@ -51,47 +59,105 @@ uint32_t Instrumentation::getFunctionNameIndex(const BinaryFunction &Function) {
return Idx;
}
void Instrumentation::createCallDescription(
const BinaryFunction &FromFunction, uint32_t From,
const BinaryFunction &ToFunction, uint32_t To) {
CallDescription CD;
CD.FromLoc.FuncString = getFunctionNameIndex(FromFunction);
CD.FromLoc.Offset = From;
CD.ToLoc.FuncString = getFunctionNameIndex(ToFunction);
CD.ToLoc.Offset = To;
CD.Counter = Counters.size();
CallDescriptions.emplace_back(CD);
}
bool Instrumentation::createEdgeDescription(
FunctionDescription &FuncDesc,
const BinaryFunction &FromFunction, uint32_t From,
uint32_t FromNodeID,
const BinaryFunction &ToFunction, uint32_t To,
uint32_t ToNodeID, bool Instrumented) {
EdgeDescription ED;
auto Result = FuncDesc.EdgesSet.insert(std::make_pair(FromNodeID, ToNodeID));
// Avoid creating duplicated edge descriptions. This happens in CFGs where a
// block jumps to its fall-through.
if (Result.second == false)
return false;
ED.FromLoc.FuncString = getFunctionNameIndex(FromFunction);
ED.FromLoc.Offset = From;
ED.FromNode = FromNodeID;
ED.ToLoc.FuncString = getFunctionNameIndex(ToFunction);
ED.ToLoc.Offset = To;
ED.ToNode = ToNodeID;
ED.Counter = Instrumented ? Counters.size() : 0xffffffff;
FuncDesc.Edges.emplace_back(ED);
return true;
}
void Instrumentation::createExitNodeDescription(FunctionDescription &FuncDesc,
uint32_t Node) {
InstrumentedNode IN;
IN.Node = Node;
IN.Counter = Counters.size();
FuncDesc.ExitNodes.emplace_back(IN);
}
std::vector<MCInst>
Instrumentation::createInstrumentationSnippet(BinaryContext &BC, bool IsLeaf) {
auto L = BC.scopeLock();
MCSymbol *Label;
Label = BC.Ctx->createTempSymbol("InstrEntry", true);
Counters.emplace_back(Label);
std::vector<MCInst> CounterInstrs(5);
// Don't clobber application red zone (ABI dependent)
if (IsLeaf)
BC.MIB->createStackPointerIncrement(CounterInstrs[0], 128,
/*NoFlagsClobber=*/true);
BC.MIB->createPushFlags(CounterInstrs[1], 2);
BC.MIB->createIncMemory(CounterInstrs[2], Label, &*BC.Ctx);
BC.MIB->createPopFlags(CounterInstrs[3], 2);
if (IsLeaf)
BC.MIB->createStackPointerDecrement(CounterInstrs[4], 128,
/*NoFlagsClobber=*/true);
return CounterInstrs;
}
void Instrumentation::instrumentExitNode(BinaryContext &BC,
BinaryBasicBlock &BB,
BinaryBasicBlock::iterator Iter,
bool IsLeaf,
FunctionDescription &FuncDesc,
uint32_t Node) {
createExitNodeDescription(FuncDesc, Node);
std::vector<MCInst> CounterInstrs = createInstrumentationSnippet(BC, IsLeaf);
for (auto &NewInst : CounterInstrs) {
Iter = BB.insertInstruction(Iter, NewInst);
++Iter;
}
}
bool Instrumentation::instrumentOneTarget(
SplitWorklistTy &SplitWorklist, SplitInstrsTy &SplitInstrs,
BinaryBasicBlock::iterator &Iter, BinaryFunction &FromFunction,
BinaryBasicBlock &FromBB, uint32_t From, BinaryFunction &ToFunc,
BinaryBasicBlock *TargetBB, uint32_t ToOffset, bool IsLeaf,
FunctionDescription *FuncDesc, uint32_t FromNodeID, uint32_t ToNodeID) {
{
auto L = FromFunction.getBinaryContext().scopeLock();
bool Created{true};
if (!TargetBB)
createCallDescription(FromFunction, From, ToFunc, ToOffset);
else
Created = createEdgeDescription(*FuncDesc, FromFunction, From, FromNodeID,
ToFunc, ToOffset, ToNodeID,
/*Instrumented=*/true);
if (!Created)
return false;
}
std::vector<MCInst> CounterInstrs =
createInstrumentationSnippet(FromFunction.getBinaryContext(), IsLeaf);
BinaryContext &BC = FromFunction.getBinaryContext();
const MCInst &Inst = *Iter;
@ -103,7 +169,7 @@ bool Instrumentation::instrumentOneTarget(BinaryBasicBlock::iterator &Iter,
return true;
}
if (!TargetBB || !FuncDesc)
return false;
// Indirect branch, conditional branches or fall-throughs
@ -130,6 +196,199 @@ bool Instrumentation::instrumentOneTarget(BinaryBasicBlock::iterator &Iter,
return true;
}
void Instrumentation::instrumentFunction(BinaryContext &BC,
BinaryFunction &Function,
MCPlusBuilder::AllocatorIdTy AllocId) {
SplitWorklistTy SplitWorklist;
SplitInstrsTy SplitInstrs;
FunctionDescription *FuncDesc = nullptr;
{
std::unique_lock<std::shared_timed_mutex> L(FDMutex);
FunctionDescriptions.emplace_back();
FuncDesc = &FunctionDescriptions.back();
}
Function.disambiguateJumpTables(AllocId);
std::unordered_map<const BinaryBasicBlock *, uint32_t> BBToID;
uint32_t Id = 0;
for (auto BBI = Function.begin(); BBI != Function.end(); ++BBI) {
BBToID[&*BBI] = Id++;
}
std::unordered_set<const BinaryBasicBlock *> VisitedSet;
// DFS to establish edges we will use for a spanning tree. Edges in the
// spanning tree can be instrumentation-free since their count can be
// inferred by solving flow equations on a bottom-up traversal of the tree.
// Exit basic blocks are always instrumented so we start the traversal with
// a minimum number of defined variables to make the equation solvable.
std::stack<std::pair<const BinaryBasicBlock *, BinaryBasicBlock *>> Stack;
std::unordered_map<const BinaryBasicBlock *,
std::set<const BinaryBasicBlock *>>
STOutSet;
for (auto BBI = Function.layout_rbegin(); BBI != Function.layout_rend();
++BBI) {
if ((*BBI)->isEntryPoint())
Stack.push(std::make_pair(nullptr, *BBI));
}
// Modified version of BinaryFunction::dfs() to build a spanning tree
while (!Stack.empty()) {
BinaryBasicBlock *BB;
const BinaryBasicBlock *Pred;
std::tie(Pred, BB) = Stack.top();
Stack.pop();
if (VisitedSet.find(BB) != VisitedSet.end())
continue;
VisitedSet.insert(BB);
if (Pred)
STOutSet[Pred].insert(BB);
for (auto *SuccBB : BB->landing_pads())
Stack.push(std::make_pair(BB, SuccBB));
for (auto *SuccBB : BB->successors())
Stack.push(std::make_pair(BB, SuccBB));
}
// Determine whether this is a leaf function, which needs special
// instructions to protect the red zone
bool IsLeafFunction{true};
for (auto BBI = Function.begin(), BBE = Function.end(); BBI != BBE; ++BBI) {
for (auto I = BBI->begin(), E = BBI->end(); I != E; ++I) {
if (BC.MIB->isCall(*I)) {
IsLeafFunction = false;
break;
}
}
if (!IsLeafFunction)
break;
}
for (auto BBI = Function.begin(), BBE = Function.end(); BBI != BBE; ++BBI) {
auto &BB{*BBI};
bool HasUnconditionalBranch{false};
bool HasJumpTable{false};
for (auto I = BB.begin(); I != BB.end(); ++I) {
const auto &Inst = *I;
if (!BC.MIB->hasAnnotation(Inst, "Offset"))
continue;
const bool IsJumpTable = Function.getJumpTable(Inst);
if (IsJumpTable)
HasJumpTable = true;
else if (BC.MIB->isUnconditionalBranch(Inst))
HasUnconditionalBranch = true;
else if ((!BC.MIB->isCall(Inst) && !BC.MIB->isConditionalBranch(Inst)) ||
BC.MIB->isUnsupportedBranch(Inst.getOpcode()))
continue;
uint32_t FromOffset = BC.MIB->getAnnotationAs<uint32_t>(Inst, "Offset");
const MCSymbol *Target = BC.MIB->getTargetSymbol(Inst);
BinaryBasicBlock *TargetBB = Function.getBasicBlockForLabel(Target);
uint32_t ToOffset = TargetBB ? TargetBB->getInputOffset() : 0;
BinaryFunction *TargetFunc =
TargetBB ? &Function : BC.getFunctionForSymbol(Target);
// Should be null for indirect branches/calls
if (TargetFunc && !TargetBB) {
if (opts::InstrumentCalls)
instrumentOneTarget(SplitWorklist, SplitInstrs, I, Function, BB,
FromOffset, *TargetFunc, TargetBB, ToOffset,
IsLeafFunction);
continue;
}
if (TargetFunc) {
// Do not instrument edges in the spanning tree
if (STOutSet[&BB].find(TargetBB) != STOutSet[&BB].end()) {
auto L = BC.scopeLock();
createEdgeDescription(*FuncDesc, Function, FromOffset, BBToID[&BB],
Function, ToOffset, BBToID[TargetBB],
/*Instrumented=*/false);
continue;
}
instrumentOneTarget(SplitWorklist, SplitInstrs, I, Function, BB,
FromOffset, *TargetFunc, TargetBB, ToOffset,
IsLeafFunction, FuncDesc, BBToID[&BB],
BBToID[TargetBB]);
continue;
}
if (IsJumpTable) {
for (auto &Succ : BB.successors()) {
// Do not instrument edges in the spanning tree
if (STOutSet[&BB].find(&*Succ) != STOutSet[&BB].end()) {
auto L = BC.scopeLock();
createEdgeDescription(*FuncDesc, Function, FromOffset, BBToID[&BB],
Function, Succ->getInputOffset(),
BBToID[&*Succ], /*Instrumented=*/false);
continue;
}
instrumentOneTarget(SplitWorklist, SplitInstrs, I, Function, BB,
FromOffset, Function, &*Succ,
Succ->getInputOffset(), IsLeafFunction, FuncDesc,
BBToID[&BB], BBToID[&*Succ]);
}
continue;
}
// FIXME: handle indirect calls
} // End of instructions loop
// Instrument fallthroughs (when the direct jump instruction is missing)
if (!HasUnconditionalBranch && !HasJumpTable && BB.succ_size() > 0 &&
BB.size() > 0) {
auto *FTBB = BB.getFallthrough();
assert(FTBB && "expected valid fall-through basic block");
auto I = BB.begin();
auto LastInstr = BB.end();
--LastInstr;
while (LastInstr != I && BC.MIB->isPseudo(*LastInstr))
--LastInstr;
uint32_t FromOffset = 0;
// The last instruction in the BB should have an annotation, except
// if it was branching to the end of the function as a result of
// __builtin_unreachable(), in which case it was deleted by fixBranches.
// Ignore this case. FIXME: force fixBranches() to preserve the offset.
if (!BC.MIB->hasAnnotation(*LastInstr, "Offset"))
continue;
FromOffset = BC.MIB->getAnnotationAs<uint32_t>(*LastInstr, "Offset");
// Do not instrument edges in the spanning tree
if (STOutSet[&BB].find(FTBB) != STOutSet[&BB].end()) {
auto L = BC.scopeLock();
createEdgeDescription(*FuncDesc, Function, FromOffset, BBToID[&BB],
Function, FTBB->getInputOffset(), BBToID[FTBB],
/*Instrumented=*/false);
continue;
}
instrumentOneTarget(SplitWorklist, SplitInstrs, I, Function, BB,
FromOffset, Function, FTBB, FTBB->getInputOffset(),
IsLeafFunction, FuncDesc, BBToID[&BB], BBToID[FTBB]);
}
} // End of BBs loop
// Instrument spanning tree leaves
for (auto BBI = Function.begin(), BBE = Function.end(); BBI != BBE; ++BBI) {
auto &BB{*BBI};
if (STOutSet[&BB].size() == 0 && BB.size() > 0)
instrumentExitNode(BC, BB, BB.begin(), IsLeafFunction, *FuncDesc,
BBToID[&BB]);
}
// Consume list of critical edges: split them and add instrumentation to the
// newly created BBs
auto Iter = SplitInstrs.begin();
for (auto &BBPair : SplitWorklist) {
auto *NewBB = Function.splitEdge(BBPair.first, BBPair.second);
NewBB->addInstructions(Iter->begin(), Iter->end());
++Iter;
}
// Unused now
FuncDesc->EdgesSet.clear();
}
void Instrumentation::runOnFunctions(BinaryContext &BC) {
if (!BC.isX86())
return;
@ -146,100 +405,28 @@ void Instrumentation::runOnFunctions(BinaryContext &BC) {
/*Alignment=*/1,
/*IsReadOnly=*/true, ELF::SHT_NOTE);
ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
return (!BF.isSimple() || !opts::shouldProcess(BF) ||
(opts::InstrumentHotOnly && !BF.getKnownExecutionCount()));
};
ParallelUtilities::WorkFuncWithAllocTy WorkFun =
[&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) {
instrumentFunction(BC, BF, AllocatorId);
};
ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
BC, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC, WorkFun,
SkipPredicate, "instrumentation", /* ForceSequential=*/true);
}
uint32_t Instrumentation::getFDSize() const {
uint32_t FuncDescSize = 0;
for (const auto &Func : FunctionDescriptions) {
FuncDescSize += 8 + Func.Edges.size() * sizeof(EdgeDescription) +
Func.ExitNodes.size() * sizeof(InstrumentedNode);
}
return FuncDescSize;
}
void Instrumentation::emitTablesAsELFNote(BinaryContext &BC) {
@ -247,12 +434,36 @@ void Instrumentation::emitTablesAsELFNote(BinaryContext &BC) {
raw_string_ostream OS(TablesStr);
// Start of the vector with descriptions (one CounterDescription for each
// counter), vector size is Counters.size() CounterDescription-sized elmts
const auto CDSize = CallDescriptions.size() * sizeof(CallDescription);
OS.write(reinterpret_cast<const char *>(&CDSize), 4);
for (const auto &Desc : CallDescriptions) {
OS.write(reinterpret_cast<const char *>(&Desc.FromLoc.FuncString), 4);
OS.write(reinterpret_cast<const char *>(&Desc.FromLoc.Offset), 4);
OS.write(reinterpret_cast<const char *>(&Desc.ToLoc.FuncString), 4);
OS.write(reinterpret_cast<const char *>(&Desc.ToLoc.Offset), 4);
OS.write(reinterpret_cast<const char *>(&Desc.Counter), 4);
}
const auto FDSize = getFDSize();
OS.write(reinterpret_cast<const char *>(&FDSize), 4);
for (const auto &Desc : FunctionDescriptions) {
const auto ExitsNum = Desc.ExitNodes.size();
OS.write(reinterpret_cast<const char *>(&ExitsNum), 4);
for (const auto &ExitNode : Desc.ExitNodes) {
OS.write(reinterpret_cast<const char *>(&ExitNode.Node), 4);
OS.write(reinterpret_cast<const char *>(&ExitNode.Counter), 4);
}
const auto EdgesNum = Desc.Edges.size();
OS.write(reinterpret_cast<const char *>(&EdgesNum), 4);
for (const auto &Edge : Desc.Edges) {
OS.write(reinterpret_cast<const char *>(&Edge.FromLoc.FuncString), 4);
OS.write(reinterpret_cast<const char *>(&Edge.FromLoc.Offset), 4);
OS.write(reinterpret_cast<const char *>(&Edge.FromNode), 4);
OS.write(reinterpret_cast<const char *>(&Edge.ToLoc.FuncString), 4);
OS.write(reinterpret_cast<const char *>(&Edge.ToLoc.Offset), 4);
OS.write(reinterpret_cast<const char *>(&Edge.ToNode), 4);
OS.write(reinterpret_cast<const char *>(&Edge.Counter), 4);
}
}
// Our string table lives immediately after descriptions vector
OS << StringTable;
@ -278,9 +489,10 @@ void Instrumentation::emit(BinaryContext &BC, MCStreamer &Streamer) {
// All of the following symbols will be exported as globals to be used by the
// instrumentation runtime library to dump the instrumentation data to disk.
// Label marking start of the memory region containing instrumentation
// counters, total vector size is Counters.size() 8-byte counters
MCSymbol *Locs = BC.Ctx->getOrCreateSymbol("__bolt_instr_locations");
MCSymbol *NumCalls = BC.Ctx->getOrCreateSymbol("__bolt_instr_num_calls");
MCSymbol *NumFuncs = BC.Ctx->getOrCreateSymbol("__bolt_instr_num_funcs");
/// File name where the profile is going to be written to after the target
/// binary finishes a run
MCSymbol *FilenameSym = BC.Ctx->getOrCreateSymbol("__bolt_instr_filename");
@ -289,23 +501,36 @@ void Instrumentation::emit(BinaryContext &BC, MCStreamer &Streamer) {
Streamer.EmitLabel(Locs);
Streamer.EmitSymbolAttribute(Locs,
MCSymbolAttr::MCSA_Global);
for (const auto &Label : Counters) {
Streamer.EmitLabel(Label);
Streamer.emitFill(8, 0);
}
Streamer.EmitLabel(NumCalls);
Streamer.EmitSymbolAttribute(NumCalls,
MCSymbolAttr::MCSA_Global);
Streamer.EmitIntValue(CallDescriptions.size(), /*Size=*/4);
Streamer.EmitLabel(NumFuncs);
Streamer.EmitSymbolAttribute(NumFuncs,
MCSymbolAttr::MCSA_Global);
Streamer.EmitIntValue(FunctionDescriptions.size(), /*Size=*/4);
Streamer.EmitLabel(FilenameSym);
Streamer.EmitBytes(opts::InstrumentationFilename);
Streamer.emitFill(1, 0);
uint32_t FuncDescSize = getFDSize();
outs() << "BOLT-INSTRUMENTER: Number of call descriptors: "
<< CallDescriptions.size() << "\n";
outs() << "BOLT-INSTRUMENTER: Number of function descriptors: "
<< FunctionDescriptions.size() << "\n";
outs() << "BOLT-INSTRUMENTER: Number of counters: " << Counters.size()
<< "\n";
outs() << "BOLT-INSTRUMENTER: Total size of counters: "
<< (Counters.size() * 8) << " bytes (static alloc memory)\n";
outs() << "BOLT-INSTRUMENTER: Total size of string table emitted: "
<< StringTable.size() << " bytes in file\n";
outs() << "BOLT-INSTRUMENTER: Total size of descriptors: "
<< (FuncDescSize + CallDescriptions.size() * sizeof(CallDescription))
<< " bytes in file\n";
outs() << "BOLT-INSTRUMENTER: Profile will be saved to file "
<< opts::InstrumentationFilename << "\n";
}


@ -54,14 +54,42 @@ public:
void emit(BinaryContext &BC, MCStreamer &Streamer);
private:
// Location information -- this is a location in the program binary
struct LocDescription {
uint32_t FuncString;
uint32_t Offset;
};
// Inter-function control flow transfer instrumentation
struct CallDescription {
LocDescription FromLoc;
LocDescription ToLoc;
uint32_t Counter;
};
// Intra-function control flow transfer instrumentation
struct EdgeDescription {
LocDescription FromLoc;
uint32_t FromNode;
LocDescription ToLoc;
uint32_t ToNode;
uint32_t Counter;
};
struct InstrumentedNode {
uint32_t Node;
uint32_t Counter;
};
struct FunctionDescription {
std::vector<InstrumentedNode> ExitNodes;
std::vector<EdgeDescription> Edges;
DenseSet<std::pair<uint32_t, uint32_t>> EdgesSet;
};
void instrumentFunction(BinaryContext &BC, BinaryFunction &Function,
MCPlusBuilder::AllocatorIdTy = 0);
/// Retrieve the string table index for the name of \p Function. We encode
/// instrumented location descriptions with the aid of a string table to
/// manage memory of the instrumentation runtime in a more efficient way.
@ -73,53 +101,69 @@ private:
/// branch source location in terms of function name plus offset, as well as
/// branch destination (also name + offset). This will be encoded in the
/// binary as static data and function name strings will reference a strtab.
void createCallDescription(const BinaryFunction &FromFunction, uint32_t From,
const BinaryFunction &ToFunction, uint32_t To);
bool createEdgeDescription(FunctionDescription &FuncDesc,
const BinaryFunction &FromFunction, uint32_t From,
uint32_t FromNodeID,
const BinaryFunction &ToFunction, uint32_t To,
uint32_t ToNodeID, bool Instrumented);
void createExitNodeDescription(FunctionDescription &FuncDesc, uint32_t Node);
/// Create the sequence of instructions to instrument a branch happening
/// at \p FromFunction + \p FromOffset to \p ToFunc + \p ToOffset
std::vector<MCInst> createInstrumentationSnippet(BinaryContext &BC,
bool IsLeaf);
// Critical edges worklist
// This worklist keeps track of CFG edges <From-To> that need to be split.
// This task is deferred until we finish processing all BBs because we can't
// modify the CFG while iterating over it. For each edge, \p SplitInstrsTy
// stores the list of instrumentation instructions as a vector of MCInsts.
// instrumentOneTarget() populates this, runOnFunctions() consumes.
using SplitWorklistTy =
std::vector<std::pair<BinaryBasicBlock *, BinaryBasicBlock *>>;
using SplitInstrsTy = std::vector<std::vector<MCInst>>;
/// Instrument the branch in \p Iter located at \p FromFunction + \p From,
/// basic block \p FromBB. The destination of the branch is \p ToFunc +
/// \p ToOffset. \p TargetBB should be non-null if this is a local branch
/// and null if it is a call. Return true on success.
bool instrumentOneTarget(SplitWorklistTy &SplitWorklist,
SplitInstrsTy &SplitInstrs,
BinaryBasicBlock::iterator &Iter,
BinaryFunction &FromFunction,
BinaryBasicBlock &FromBB, uint32_t From,
BinaryFunction &ToFunc, BinaryBasicBlock *TargetBB,
uint32_t ToOffset, bool IsLeaf,
FunctionDescription *FuncDesc = nullptr,
uint32_t FromNodeID = 0, uint32_t ToNodeID = 0);
void instrumentExitNode(BinaryContext &BC, BinaryBasicBlock &BB,
BinaryBasicBlock::iterator Iter, bool IsLeaf,
FunctionDescription &FuncDesc, uint32_t Node);
uint32_t getFDSize() const;
/// Create a non-allocatable ELF section with read-only tables necessary for
/// writing the instrumented data profile during program finish. The runtime
/// library needs to open the program executable file and read this data from
/// disk, this is not loaded by the system.
void emitTablesAsELFNote(BinaryContext &BC);
/// Stores function names, to be emitted to the runtime
std::string StringTable;
/// strtab indices in StringTable for each function name
std::unordered_map<const BinaryFunction *, uint32_t> FuncToStringIdx;
/// Intra-function control flow
std::vector<FunctionDescription> FunctionDescriptions;
mutable std::shared_timed_mutex FDMutex;
/// Inter-function control flow
std::vector<CallDescription> CallDescriptions;
/// Identifies all counters used at run time while instrumentation is running
std::vector<MCSymbol *> Counters;
};
}


@ -3098,9 +3098,13 @@ void RewriteInstance::emitAndLink() {
if (EFMM->ObjectsLoaded) {
auto Result = OLT->findSymbol(Name, false);
if (cantFail(Result.getAddress()) == 0) {
// Resolve to a PLT entry if possible
if (auto *I = BC->getBinaryDataByName(Name + "@PLT"))
return JITSymbol(I->getAddress(), JITSymbolFlags());
errs() << "BOLT-ERROR: symbol not found required by runtime "
"library: "
<< Name << "\n";
exit(1);
}
return Result;