mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2025-02-14 14:56:47 +00:00
[BOLT] Efficient edge profiling in instrumented mode
Summary: Change our edge profiling technique when using instrumentation so that we do not instrument every edge. Instead, build the spanning tree for the CFG and omit instrumentation for edges in the spanning tree. Infer the edge count for these edges when writing the profile during run time. The inference works with a bottom-up traversal of the spanning tree and establishes the value of the edge connecting to the parent based on a simple flow equation involving output and input edges, where the only unknown variable is the parent edge. This requires some engineering in the runtime lib to support dynamic allocation for building these graphs at runtime. (cherry picked from FBD17062773)
This commit is contained in:
parent
52786928ff
commit
cc4b2fb614
@ -8,5 +8,7 @@ project(libbolt_rt_project)
|
||||
add_library(bolt_rt STATIC
|
||||
instr.cpp
|
||||
)
|
||||
# Don't let the compiler think it can create calls to standard libs
|
||||
target_compile_options(bolt_rt PRIVATE -ffreestanding)
|
||||
|
||||
install(TARGETS bolt_rt DESTINATION lib)
|
||||
|
@ -17,16 +17,35 @@
|
||||
#include <cstdint>
|
||||
#include <elf.h>
|
||||
|
||||
// All extern declarations here need to be defined by BOLT itself.
|
||||
//#define ENABLE_DEBUG
|
||||
|
||||
#ifdef ENABLE_DEBUG
|
||||
#define DEBUG(X) \
|
||||
{ X; }
|
||||
#else
|
||||
#define DEBUG(X) \
|
||||
{}
|
||||
#endif
|
||||
|
||||
// All extern declarations here need to be defined by BOLT itself
|
||||
|
||||
// Counters inserted by instrumentation, incremented during runtime when
|
||||
// points of interest (locations) in the program are reached.
|
||||
extern uint64_t __bolt_instr_locations[];
|
||||
// Number of counters.
|
||||
extern uint32_t __bolt_instr_num_locs;
|
||||
// Filename to dump data to.
|
||||
// Number of call descriptions
|
||||
extern uint32_t __bolt_instr_num_calls;
|
||||
// Number of function descriptions
|
||||
extern uint32_t __bolt_instr_num_funcs;
|
||||
// Filename to dump data to
|
||||
extern char __bolt_instr_filename[];
|
||||
|
||||
// Anonymous namespace covering everything but our library entry point
|
||||
namespace {
|
||||
|
||||
// We use a stack-allocated buffer for string manipulation in some
|
||||
// places with this size.
|
||||
constexpr uint32_t BufSize = 10240;
|
||||
|
||||
// A location is a function name plus offset. Function name needs to be
|
||||
// retrieved from the string table and is stored as an index to this table.
|
||||
struct Location {
|
||||
@ -34,30 +53,58 @@ struct Location {
|
||||
uint32_t Offset;
|
||||
};
|
||||
|
||||
// A call description records one instrumented call site: the location the
// call is made from, its target, and its associated counter.
struct CallDescription {
  Location From;
  Location To;
  uint32_t Counter;
};

// An edge description defines an instrumented edge in the program, fully
// identified by where the jump is located and its destination.
struct EdgeDescription {
  Location From;
  uint32_t FromNode;
  Location To;
  uint32_t ToNode;
  uint32_t Counter;
};

// This description is used for instrumented basic blocks. We only instrument
// blocks that are leaves of the spanning tree associated with the CFG. We use
// their execution count as a starting point to determine the frequency of the
// incoming edge in the spanning tree (the one that is not explicitly
// instrumented). All other edges (not in the spanning tree) have an associated
// counter themselves. We never write a node's counter to the profile, it is
// only used for inference of other edges.
struct InstrumentedNode {
  uint32_t Node;    // CFG node index this counter belongs to
  uint32_t Counter; // Index into the runtime counters array
};

// General metadata describing a function - number of CFG nodes, leaf nodes and
// edges.
struct FunctionDescription {
  uint32_t NumNodes;
  InstrumentedNode *LeafNodes;
  uint32_t NumEdges;
  EdgeDescription *Edges;
};
|
||||
|
||||
// These need to be read from disk. They are generated by BOLT and written to
|
||||
// an ELF note section in the binary itself.
|
||||
struct InstrumentationInfo {
|
||||
EdgeDescription *Descriptions;
|
||||
char *Strings; // String table with function names used in this binary
|
||||
int FileDesc; // File descriptor for the file on disk backing this
|
||||
// information in memory via mmap
|
||||
uint8_t *MMapPtr; // The mmap ptr
|
||||
int MMapSize; // The mmap size
|
||||
CallDescription *CallDescriptions;
|
||||
uint8_t *FuncDescriptions;
|
||||
char *Strings; // String table with function names used in this binary
|
||||
int FileDesc; // File descriptor for the file on disk backing this
|
||||
// information in memory via mmap
|
||||
void *MMapPtr; // The mmap ptr
|
||||
int MMapSize; // The mmap size
|
||||
};
|
||||
|
||||
// Declare some syscall wrappers we use throughout this code to avoid linking
|
||||
// against system libc.
|
||||
static uint64_t
|
||||
__open(const char *pathname,
|
||||
uint64_t flags,
|
||||
uint64_t mode) {
|
||||
uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) {
|
||||
uint64_t ret;
|
||||
__asm__ __volatile__ (
|
||||
"movq $2, %%rax\n"
|
||||
@ -68,7 +115,7 @@ __open(const char *pathname,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
|
||||
uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
|
||||
uint64_t ret;
|
||||
__asm__ __volatile__ (
|
||||
"movq $1, %%rax\n"
|
||||
@ -79,7 +126,7 @@ static uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
static uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
|
||||
uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
|
||||
uint64_t ret;
|
||||
__asm__ __volatile__ (
|
||||
"movq $8, %%rax\n"
|
||||
@ -90,7 +137,7 @@ static uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int __close(uint64_t fd) {
|
||||
int __close(uint64_t fd) {
|
||||
uint64_t ret;
|
||||
__asm__ __volatile__ (
|
||||
"movq $3, %%rax\n"
|
||||
@ -101,8 +148,8 @@ static int __close(uint64_t fd) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void *__mmap(uint64_t addr, uint64_t size, uint64_t prot,
|
||||
uint64_t flags, uint64_t fd, uint64_t offset) {
|
||||
void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags,
|
||||
uint64_t fd, uint64_t offset) {
|
||||
void *ret;
|
||||
register uint64_t r8 asm("r8") = fd;
|
||||
register uint64_t r9 asm("r9") = offset;
|
||||
@ -116,7 +163,7 @@ static void *__mmap(uint64_t addr, uint64_t size, uint64_t prot,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static uint64_t __munmap(void *addr, uint64_t size) {
|
||||
uint64_t __munmap(void *addr, uint64_t size) {
|
||||
uint64_t ret;
|
||||
__asm__ __volatile__ (
|
||||
"movq $11, %%rax\n"
|
||||
@ -127,7 +174,7 @@ static uint64_t __munmap(void *addr, uint64_t size) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
static uint64_t __exit(uint64_t code) {
|
||||
uint64_t __exit(uint64_t code) {
|
||||
uint64_t ret;
|
||||
__asm__ __volatile__ (
|
||||
"movq $231, %%rax\n"
|
||||
@ -142,9 +189,9 @@ static uint64_t __exit(uint64_t code) {
|
||||
|
||||
// Write number Num using Base to the buffer in OutBuf, returns a pointer to
|
||||
// the end of the string.
|
||||
static char *intToStr(char *OutBuf, uint32_t Num, uint32_t Base) {
|
||||
char *intToStr(char *OutBuf, uint64_t Num, uint32_t Base) {
|
||||
const char *Chars = "0123456789abcdef";
|
||||
char Buf[20];
|
||||
char Buf[21];
|
||||
char *Ptr = Buf;
|
||||
while (Num) {
|
||||
*Ptr++ = *(Chars + (Num % Base));
|
||||
@ -160,22 +207,147 @@ static char *intToStr(char *OutBuf, uint32_t Num, uint32_t Base) {
|
||||
return OutBuf;
|
||||
}
|
||||
|
||||
// Copy Str to OutBuf, returns a pointer to the end of the copied string.
|
||||
static char *strCopy(char *OutBuf, const char *Str) {
|
||||
while (*Str)
|
||||
// Copy Str to OutBuf, returns a pointer to the end of the copied string
|
||||
char *strCopy(char *OutBuf, const char *Str, int32_t Size = BufSize) {
|
||||
while (*Str) {
|
||||
*OutBuf++ = *Str++;
|
||||
if (--Size <= 0)
|
||||
return OutBuf;
|
||||
}
|
||||
return OutBuf;
|
||||
}
|
||||
|
||||
// Print Msg to STDERR and quits with error code 1.
|
||||
static void reportError(const char *Msg, uint64_t Size) {
|
||||
// Fills Size bytes starting at Buf with the byte C. Freestanding replacement
// for memset(), since this runtime cannot link against libc.
// Fix: the loop index was a signed `int` compared against the unsigned
// `Size`, which is a sign-compare hazard and misbehaves for Size > INT_MAX.
void memSet(char *Buf, char C, uint32_t Size) {
  for (uint32_t I = 0; I < Size; ++I)
    *Buf++ = C;
}
|
||||
|
||||
// Returns the number of characters in the null-terminated string Str,
// excluding the terminator (freestanding strlen()).
uint32_t strLen(const char *Str) {
  const char *End = Str;
  while (*End != '\0')
    ++End;
  return End - Str;
}
|
||||
|
||||
// Print Msg to STDERR and quits with error code 1
|
||||
void reportError(const char *Msg, uint64_t Size) {
|
||||
__write(2, Msg, Size);
|
||||
__exit(1);
|
||||
}
|
||||
|
||||
void assert(bool Assertion, const char *Msg) {
|
||||
if (Assertion)
|
||||
return;
|
||||
char Buf[BufSize];
|
||||
char *Ptr = Buf;
|
||||
Ptr = strCopy(Ptr, "Assertion failed: ");
|
||||
Ptr = strCopy(Ptr, Msg, BufSize - 40);
|
||||
Ptr = strCopy(Ptr, "\n");
|
||||
reportError(Buf, Ptr - Buf);
|
||||
}
|
||||
|
||||
void reportNumber(const char *Msg, uint64_t Num, uint32_t Base) {
|
||||
char Buf[BufSize];
|
||||
char *Ptr = Buf;
|
||||
Ptr = strCopy(Ptr, Msg, BufSize - 23);
|
||||
Ptr = intToStr(Ptr, Num, Base);
|
||||
Ptr = strCopy(Ptr, "\n");
|
||||
__write(2, Buf, Ptr - Buf);
|
||||
}
|
||||
|
||||
// Prints the null-terminated string Msg to STDERR.
void report(const char *Msg) {
  __write(2, Msg, strLen(Msg));
}
|
||||
|
||||
// Rounds Value up to the nearest multiple of Align. Align must be non-zero.
inline uint64_t alignTo(uint64_t Value, uint64_t Align) {
  const uint64_t Units = (Value + Align - 1) / Align;
  return Units * Align;
}
|
||||
|
||||
/// A simple allocator that mmaps a fixed size region and manages this space
/// in a stack fashion, meaning you always deallocate the last element that
/// was allocated.
///
/// Each allocation is preceded in memory by an EntryMetadata header holding a
/// magic value (corruption check) and the rounded-up allocation size, which
/// is what makes LIFO deallocation possible.
class BumpPtrAllocator {
  // Header written immediately before every returned allocation.
  struct EntryMetadata {
    uint64_t Magic;     // Must equal MAGIC on deallocate
    uint64_t AllocSize; // Total size including this header, 16-byte aligned
  };
public:
  // Returns Size usable bytes. The backing region is mmapped lazily on the
  // first call (anonymous, read/write, MAXSIZE bytes).
  void *allocate(uintptr_t Size) {
    if (StackBase == nullptr) {
      StackBase = reinterpret_cast<uint8_t *>(__mmap(
          0, MAXSIZE, 0x3 /* PROT_READ | PROT_WRITE*/,
          0x22 /* MAP_PRIVATE | MAP_ANONYMOUS*/, -1, 0));
      StackSize = 0;
    }
    // Reserve space for the metadata header and keep 16-byte alignment.
    Size = alignTo(Size + sizeof(EntryMetadata), 16);
    uint8_t * AllocAddress = StackBase + StackSize + sizeof(EntryMetadata);
    auto *M = reinterpret_cast<EntryMetadata *>(StackBase + StackSize);
    M->Magic = MAGIC;
    M->AllocSize = Size;
    StackSize += Size;
    assert(StackSize < MAXSIZE, "allocator ran out of memory");
    return AllocAddress;
  }

  // Releases Ptr, which must be the most recent live allocation (LIFO order
  // is asserted). Handles both scalar allocations and operator new[]
  // allocations, whose compiler-inserted element-count cookie shifts the
  // metadata header by one extra uint64_t.
  void deallocate(void *Ptr) {
    uint8_t MetadataOffset = sizeof(EntryMetadata);
    auto *M = reinterpret_cast<EntryMetadata *>(
        reinterpret_cast<uint8_t *>(Ptr) - MetadataOffset);
    const uint8_t *StackTop = StackBase + StackSize + MetadataOffset;
    // Validate size
    if (Ptr != StackTop - M->AllocSize) {
      // This could be a pointer returned by operator new []
      MetadataOffset +=
          sizeof(uint64_t); // Space for number of elements alloc'ed
      M = reinterpret_cast<EntryMetadata *>(reinterpret_cast<uint8_t *>(Ptr) -
                                            MetadataOffset);
      assert(Ptr == StackTop - M->AllocSize,
             "must deallocate the last element alloc'ed");
    }
    assert(M->Magic == MAGIC, "allocator magic is corrupt");
    StackSize -= M->AllocSize;
  }

private:
  static constexpr uint64_t MAGIC = 0x1122334455667788ull;
  static constexpr uint64_t MAXSIZE = 0xa00000; // 10 MiB backing region
  uint8_t *StackBase{nullptr}; // Lazily mmapped base of the region
  uint64_t StackSize{0};       // Bytes currently in use
  // NOTE(review): LastAlloc is never read or written in this class as shown
  // here — looks like dead state; confirm before removing.
  uint8_t *LastAlloc{nullptr};
};
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
// User-defined placement new operators. We only use those (as opposed to
// overriding the regular operator new) so we can keep our allocator in the
// stack instead of in a data section (global).

// Plain placement form: carve Sz bytes out of allocator A.
void *operator new(uintptr_t Sz, BumpPtrAllocator &A) {
  return A.allocate(Sz);
}
// Fill form: allocate Sz bytes and set every byte to C.
void *operator new (uintptr_t Sz, BumpPtrAllocator &A, char C) {
  auto *Ptr = reinterpret_cast<char *>(A.allocate(Sz));
  memSet(Ptr, C, Sz);
  return Ptr;
}
// Array forms mirror the scalar forms above.
void *operator new [] (uintptr_t Sz, BumpPtrAllocator &A) {
  return A.allocate(Sz);
}
void *operator new [] (uintptr_t Sz, BumpPtrAllocator &A, char C) {
  auto *Ptr = reinterpret_cast<char *>(A.allocate(Sz));
  memSet(Ptr, C, Sz);
  return Ptr;
}
// Only called during exception unwinding (useless). We must manually dealloc.
// C++ language weirdness
void operator delete(void *Ptr, BumpPtrAllocator &A) {
  A.deallocate(Ptr);
}
|
||||
|
||||
namespace {
|
||||
|
||||
// Perform a string comparison and returns zero if Str1 matches Str2. Compares
|
||||
// at most Size characters.
|
||||
static int compareStr(const char *Str1, const char *Str2, int Size) {
|
||||
int compareStr(const char *Str1, const char *Str2, int Size) {
|
||||
while (*Str1 == *Str2) {
|
||||
if (*Str1 == '\0' || --Size == 0)
|
||||
return 0;
|
||||
@ -187,15 +359,19 @@ static int compareStr(const char *Str1, const char *Str2, int Size) {
|
||||
|
||||
// Write as a string in OutBuf an identifier for the program point at function
|
||||
// whose name is in the string table index FuncStrIndex plus Offset.
|
||||
static char *serializeLoc(const InstrumentationInfo &Info, char *OutBuf,
|
||||
const Location Loc) {
|
||||
char *serializeLoc(const InstrumentationInfo &Info, char *OutBuf,
|
||||
const Location Loc, uint32_t BufSize) {
|
||||
// fdata location format: Type Name Offset
|
||||
// Type 1 - regular symbol
|
||||
OutBuf = strCopy(OutBuf, "1 ");
|
||||
const char *Str = Info.Strings + Loc.FunctionName;
|
||||
uint32_t Size = 25;
|
||||
while (*Str) {
|
||||
*OutBuf++ = *Str++;
|
||||
if (++Size >= BufSize)
|
||||
break;
|
||||
}
|
||||
assert(!*Str, "buffer overflow, function name too large");
|
||||
*OutBuf++ = ' ';
|
||||
OutBuf = intToStr(OutBuf, Loc.Offset, 16);
|
||||
*OutBuf++ = ' ';
|
||||
@ -204,11 +380,12 @@ static char *serializeLoc(const InstrumentationInfo &Info, char *OutBuf,
|
||||
|
||||
// Read and map to memory the descriptions written by BOLT into the executable's
|
||||
// notes section
|
||||
static InstrumentationInfo readDescriptions() {
|
||||
InstrumentationInfo readDescriptions() {
|
||||
InstrumentationInfo Result;
|
||||
uint64_t FD = __open("/proc/self/exe",
|
||||
/*flags=*/0 /*O_RDONLY*/,
|
||||
/*mode=*/0666);
|
||||
assert(static_cast<int64_t>(FD) > 0, "Failed to open /proc/self/exe");
|
||||
Result.FileDesc = FD;
|
||||
|
||||
// mmap our binary to memory
|
||||
@ -232,18 +409,20 @@ static InstrumentationInfo readDescriptions() {
|
||||
continue;
|
||||
}
|
||||
// Actual contents of the ELF note start after offset 20 decimal:
|
||||
// Offset 0: Producer name size (4 bytes)
|
||||
// Offset 4: Contents size (4 bytes)
|
||||
// Offset 8: Note type (4 bytes)
|
||||
// Offset 12: Producer name (BOLT\0) (5 bytes + align to 4-byte boundary)
|
||||
// Offset 20: Contents
|
||||
Result.Descriptions =
|
||||
reinterpret_cast<EdgeDescription *>(BinContents + Shdr->sh_offset + 20);
|
||||
// String table is located after the full EdgeDescriptions table containing
|
||||
// __bolt_instr_num_locs entries is finished
|
||||
Result.Strings = reinterpret_cast<char *>(
|
||||
BinContents + Shdr->sh_offset + 20 +
|
||||
(__bolt_instr_num_locs * sizeof(EdgeDescription)));
|
||||
// Offset 0: Producer name size (4 bytes)
|
||||
// Offset 4: Contents size (4 bytes)
|
||||
// Offset 8: Note type (4 bytes)
|
||||
// Offset 12: Producer name (BOLT\0) (5 bytes + align to 4-byte boundary)
|
||||
// Offset 20: Contents
|
||||
uint32_t CallDescSize =
|
||||
*reinterpret_cast<uint32_t *>(BinContents + Shdr->sh_offset + 20);
|
||||
uint32_t FuncDescSize = *reinterpret_cast<uint32_t *>(
|
||||
BinContents + Shdr->sh_offset + 24 + CallDescSize);
|
||||
Result.CallDescriptions =
|
||||
reinterpret_cast<CallDescription *>(BinContents + Shdr->sh_offset + 24);
|
||||
Result.FuncDescriptions = BinContents + Shdr->sh_offset + 28 + CallDescSize;
|
||||
Result.Strings = reinterpret_cast<char *>(BinContents + Shdr->sh_offset +
|
||||
28 + CallDescSize + FuncDescSize);
|
||||
return Result;
|
||||
}
|
||||
const char ErrMsg[] =
|
||||
@ -253,6 +432,419 @@ static InstrumentationInfo readDescriptions() {
|
||||
return Result;
|
||||
}
|
||||
|
||||
// Dumps, to STDERR, the sizes of the instrumentation tables read from the
// binary plus the call/function counts exported by BOLT. Debug aid (called
// under the DEBUG macro in __bolt_instr_data_dump).
void printStats(const InstrumentationInfo &Info) {
  char StatMsg[BufSize];
  char *StatPtr = StatMsg;
  StatPtr = strCopy(
      StatPtr, "\nBOLT INSTRUMENTATION RUNTIME STATISTICS\n\nCallDescSize: ");
  // Call descriptions occupy the bytes between CallDescriptions and
  // FuncDescriptions in the mapped note section.
  StatPtr = intToStr(StatPtr,
                     Info.FuncDescriptions -
                         reinterpret_cast<uint8_t *>(Info.CallDescriptions),
                     10);
  StatPtr = strCopy(StatPtr, "\nFuncDescSize: ");
  // Function descriptions end where the string table begins.
  StatPtr = intToStr(
      StatPtr,
      reinterpret_cast<uint8_t *>(Info.Strings) - Info.FuncDescriptions, 10);
  StatPtr = strCopy(StatPtr, "\n__bolt_instr_num_calls: ");
  StatPtr = intToStr(StatPtr, __bolt_instr_num_calls, 10);
  StatPtr = strCopy(StatPtr, "\n__bolt_instr_num_funcs: ");
  StatPtr = intToStr(StatPtr, __bolt_instr_num_funcs, 10);
  StatPtr = strCopy(StatPtr, "\n");
  __write(2, StatMsg, StatPtr - StatMsg);
}
|
||||
|
||||
/// This is part of a simple CFG representation in memory, where we store
/// a dynamically sized array of input and output edges per node, and store
/// a dynamically sized array of nodes per graph. We also store the spanning
/// tree edges for that CFG in a separate array of nodes in
/// \p SpanningTreeNodes, while the regular nodes live in \p CFGNodes.
struct Edge {
  uint32_t Node; // Index in nodes array regarding the destination of this edge
  uint32_t ID;   // Edge index in an array comprising all edges of the graph
};

/// A regular graph node or a spanning tree node
struct Node {
  uint32_t NumInEdges{0};  // Input edge count used to size InEdges
  uint32_t NumOutEdges{0}; // Output edge count used to size OutEdges
  Edge *InEdges{nullptr};  // Created and managed by \p Graph
  Edge *OutEdges{nullptr}; // ditto
};

/// Main class for CFG representation in memory. Manages object creation and
/// destruction, populates an array of CFG nodes as well as corresponding
/// spanning tree nodes.
struct Graph {
  uint32_t NumNodes;       // Entries in both CFGNodes and SpanningTreeNodes
  Node *CFGNodes;          // Full CFG adjacency (every edge)
  Node *SpanningTreeNodes; // Adjacency restricted to spanning-tree edges
  BumpPtrAllocator &Alloc; // Owns all node/edge arrays above

  /// Reads a list of \p NumEdgeDescs descriptions in \p EdgeDescs and builds
  /// the graph from it. Allocates several internal dynamic structures that are
  /// later destroyed by ~Graph() and uses \p Alloc. \p LeafNodes contain all
  /// spanning tree leaf nodes descriptions (their counters). They are the seed
  /// used to compute the rest of the missing edge counts in a bottom-up
  /// traversal of the spanning tree.
  Graph(BumpPtrAllocator &Alloc, const EdgeDescription *EdgeDescs,
        uint32_t NumEdgeDescs, const InstrumentedNode *LeafNodes,
        uint32_t NumLeafNodes);

  /// Frees every array via Alloc, in exact reverse allocation order as
  /// required by the stack-like BumpPtrAllocator.
  ~Graph();
  /// Prints the full graph and the spanning tree to STDERR (debug aid).
  void dump() const;
};
|
||||
|
||||
Graph::Graph(BumpPtrAllocator &Alloc, const EdgeDescription *EdgeDescs,
             uint32_t NumEdgeDescs, const InstrumentedNode *LeafNodes,
             uint32_t NumLeafNodes) : Alloc(Alloc) {
  DEBUG(reportNumber("G = 0x", (uint64_t)this, 16));
  // First pass to determine number of nodes: the highest node index mentioned
  // in any edge or leaf description.
  uint32_t MaxNodes = 0;
  for (int I = 0; I < NumEdgeDescs; ++I) {
    if (EdgeDescs[I].FromNode > MaxNodes)
      MaxNodes = EdgeDescs[I].FromNode;
    if (EdgeDescs[I].ToNode > MaxNodes)
      MaxNodes = EdgeDescs[I].ToNode;
  }
  for (int I = 0; I < NumLeafNodes; ++I) {
    if (LeafNodes[I].Node > MaxNodes)
      MaxNodes = LeafNodes[I].Node;
  }
  // No edges? Nothing to do
  if (!MaxNodes) {
    CFGNodes = nullptr;
    SpanningTreeNodes = nullptr;
    NumNodes = 0;
    return;
  }
  // Turn the max index into a count.
  ++MaxNodes;
  DEBUG(reportNumber("NumNodes = ", MaxNodes, 10));
  NumNodes = MaxNodes;

  // Initial allocations. Node's member initializers zero the edge counts and
  // pointers.
  CFGNodes = new (Alloc) Node[MaxNodes];
  DEBUG(reportNumber("G->CFGNodes = 0x", (uint64_t)CFGNodes, 16));
  SpanningTreeNodes = new (Alloc) Node[MaxNodes];
  DEBUG(reportNumber("G->SpanningTreeNodes = 0x",
                     (uint64_t)SpanningTreeNodes, 16));

  // Figure out how much to allocate to each vector (in/out edge sets).
  // A Counter of 0xffffffff marks an edge with no physical counter, i.e. an
  // edge that belongs to the spanning tree.
  for (int I = 0; I < NumEdgeDescs; ++I) {
    CFGNodes[EdgeDescs[I].FromNode].NumOutEdges++;
    CFGNodes[EdgeDescs[I].ToNode].NumInEdges++;
    if (EdgeDescs[I].Counter != 0xffffffff)
      continue;

    SpanningTreeNodes[EdgeDescs[I].FromNode].NumOutEdges++;
    SpanningTreeNodes[EdgeDescs[I].ToNode].NumInEdges++;
  }

  // Allocate in/out edge sets
  for (int I = 0; I < MaxNodes; ++I) {
    if (CFGNodes[I].NumInEdges > 0)
      CFGNodes[I].InEdges = new (Alloc) Edge[CFGNodes[I].NumInEdges];
    if (CFGNodes[I].NumOutEdges > 0)
      CFGNodes[I].OutEdges = new (Alloc) Edge[CFGNodes[I].NumOutEdges];
    if (SpanningTreeNodes[I].NumInEdges > 0)
      SpanningTreeNodes[I].InEdges =
          new (Alloc) Edge[SpanningTreeNodes[I].NumInEdges];
    if (SpanningTreeNodes[I].NumOutEdges > 0)
      SpanningTreeNodes[I].OutEdges =
          new (Alloc) Edge[SpanningTreeNodes[I].NumOutEdges];
    // Reset the counts so the fill pass below can reuse them as insertion
    // cursors; they end up back at their true values.
    CFGNodes[I].NumInEdges = 0;
    CFGNodes[I].NumOutEdges = 0;
    SpanningTreeNodes[I].NumInEdges = 0;
    SpanningTreeNodes[I].NumOutEdges = 0;
  }

  // Fill in/out edge sets
  for (int I = 0; I < NumEdgeDescs; ++I) {
    const uint32_t Src = EdgeDescs[I].FromNode;
    const uint32_t Dst = EdgeDescs[I].ToNode;
    Edge *E = &CFGNodes[Src].OutEdges[CFGNodes[Src].NumOutEdges++];
    E->Node = Dst;
    E->ID = I;

    E = &CFGNodes[Dst].InEdges[CFGNodes[Dst].NumInEdges++];
    E->Node = Src;
    E->ID = I;

    // Spanning-tree edges (no physical counter) are mirrored into the
    // spanning-tree adjacency as well.
    if (EdgeDescs[I].Counter != 0xffffffff)
      continue;

    E = &SpanningTreeNodes[Src]
             .OutEdges[SpanningTreeNodes[Src].NumOutEdges++];
    E->Node = Dst;
    E->ID = I;

    E = &SpanningTreeNodes[Dst]
             .InEdges[SpanningTreeNodes[Dst].NumInEdges++];
    E->Node = Src;
    E->ID = I;
  }
}
|
||||
|
||||
Graph::~Graph() {
  // Free everything in exact reverse order of allocation in the constructor:
  // the BumpPtrAllocator is a stack and asserts on out-of-order deallocation.
  for (int I = NumNodes - 1; I >= 0; --I) {
    if (SpanningTreeNodes[I].OutEdges)
      Alloc.deallocate(SpanningTreeNodes[I].OutEdges);
    if (SpanningTreeNodes[I].InEdges)
      Alloc.deallocate(SpanningTreeNodes[I].InEdges);
    if (CFGNodes[I].OutEdges)
      Alloc.deallocate(CFGNodes[I].OutEdges);
    if (CFGNodes[I].InEdges)
      Alloc.deallocate(CFGNodes[I].InEdges);
  }
  // The node arrays themselves were allocated first, so they go last. Both
  // are null when the constructor bailed out early (empty graph).
  if (SpanningTreeNodes)
    Alloc.deallocate(SpanningTreeNodes);
  if (CFGNodes)
    Alloc.deallocate(CFGNodes);
}
|
||||
|
||||
// Prints the full CFG adjacency and the spanning-tree adjacency to STDERR.
// Debug aid only.
void Graph::dump() const {
  reportNumber("Dumping graph with number of nodes: ", NumNodes, 10);
  report(" Full graph:\n");
  for (int I = 0; I < NumNodes; ++I) {
    const Node *N = &CFGNodes[I];
    reportNumber(" Node #", I, 10);
    reportNumber(" InEdges total ", N->NumInEdges, 10);
    for (int J = 0; J < N->NumInEdges; ++J)
      reportNumber(" ", N->InEdges[J].Node, 10);
    reportNumber(" OutEdges total ", N->NumOutEdges, 10);
    for (int J = 0; J < N->NumOutEdges; ++J)
      reportNumber(" ", N->OutEdges[J].Node, 10);
    report("\n");
  }
  report(" Spanning tree:\n");
  for (int I = 0; I < NumNodes; ++I) {
    const Node *N = &SpanningTreeNodes[I];
    reportNumber(" Node #", I, 10);
    reportNumber(" InEdges total ", N->NumInEdges, 10);
    for (int J = 0; J < N->NumInEdges; ++J)
      reportNumber(" ", N->InEdges[J].Node, 10);
    reportNumber(" OutEdges total ", N->NumOutEdges, 10);
    for (int J = 0; J < N->NumOutEdges; ++J)
      reportNumber(" ", N->OutEdges[J].Node, 10);
    report("\n");
  }
}
|
||||
|
||||
// Prints, to STDERR, the source node, destination node and computed count of
// every edge in EdgeFreqs (indexed in parallel with EdgeDescs). Debug aid.
void dumpEdgeFreqs(const uint64_t *EdgeFreqs, const EdgeDescription *EdgeDescs,
                   uint32_t NumEdges) {
  reportNumber("Dumping edge frequencies for graph with num edges: ", NumEdges,
               10);
  for (int I = 0; I < NumEdges; ++I) {
    reportNumber("* Src: ", EdgeDescs[I].FromNode, 10);
    reportNumber(" Dst: ", EdgeDescs[I].ToNode, 10);
    reportNumber(" Cnt: ", EdgeFreqs[I], 10);
  }
}
|
||||
|
||||
// Return an array with the frequency of each edge in the function represented
// by G. The array is allocated from Alloc and indexed in parallel with
// EdgeDescs; the caller owns it (deallocate via Alloc). Returns 0 (null) for
// empty graphs or when the spanning tree has no roots.
uint64_t *computeEdgeFrequencies(BumpPtrAllocator &Alloc, Graph *G,
                                 const EdgeDescription *EdgeDescs,
                                 uint32_t NumEdges,
                                 const InstrumentedNode *LeafNodes,
                                 uint32_t NumLeafNodes,
                                 const uint64_t *Counters) {
  if (G->NumNodes == 0 || NumEdges == 0)
    return 0;
  assert(NumLeafNodes > 0, "no leaf node frequency");

  // Zero-filled result array (the placement-new fill form writes byte 0).
  uint64_t *EdgeFrequency = new (Alloc, 0) uint64_t [NumEdges];

  // Perform a bottom-up, BFS traversal of the spanning tree in G. Edges in the
  // spanning tree don't have explicit counters. We must infer their value using
  // a linear combination of other counters (sum of counters of the outgoing
  // edges minus sum of counters of the incoming edges).
  uint32_t *Stack = new (Alloc) uint32_t [G->NumNodes];
  uint32_t StackTop = 0;
  enum Status : uint8_t { S_NEW = 0, S_VISITING, S_VISITED };
  Status *Visited = new (Alloc, 0) Status[G->NumNodes];
  uint64_t *LeafFrequency = new (Alloc, 0) uint64_t[G->NumNodes];

  // Setup a fast lookup for frequency of leaf nodes, which have special
  // basic block frequency instrumentation (they are not edge profiled).
  uint64_t TotalFreq = 0;
  for (int I = 0; I < NumLeafNodes; ++I) {
    LeafFrequency[LeafNodes[I].Node] = Counters[LeafNodes[I].Counter];
    DEBUG({
      if (Counters[LeafNodes[I].Counter] > 0) {
        reportNumber("Leaf Node# ", LeafNodes[I].Node, 10);
        reportNumber(" Counter: ", Counters[LeafNodes[I].Counter], 10);
      }
    });
    TotalFreq += Counters[LeafNodes[I].Counter];
  }
  // Add all root nodes (no incoming spanning-tree edge) to the stack
  for (int I = 0; I < G->NumNodes; ++I) {
    if (G->SpanningTreeNodes[I].NumInEdges == 0)
      Stack[StackTop++] = I;
  }
  // Empty stack? Nothing to traverse; release everything in LIFO order.
  if (StackTop == 0) {
    Alloc.deallocate(LeafFrequency);
    Alloc.deallocate(Visited);
    Alloc.deallocate(Stack);
    Alloc.deallocate(EdgeFrequency);
    return 0;
  }
  // Add all known edge counts, will infer the rest
  for (int I = 0; I < NumEdges; ++I) {
    const uint32_t C = EdgeDescs[I].Counter;
    if (C == 0xffffffff) // inferred counter - we will compute its value
      continue;
    EdgeFrequency[I] = Counters[C];
    TotalFreq += Counters[C];
  }

  // This function is completely cold, there is no point in computing anything
  // since inferred edges will be zero too.
#ifndef ENABLE_DEBUG
  if (TotalFreq == 0) {
    Alloc.deallocate(LeafFrequency);
    Alloc.deallocate(Visited);
    Alloc.deallocate(Stack);
    return EdgeFrequency;
  }
#endif

  // Two-color DFS: a node is pushed once as S_NEW (push children), then
  // revisited as S_VISITING (children resolved) to compute its parent edge.
  while (StackTop > 0) {
    const uint32_t Cur = Stack[--StackTop];
    DEBUG({
      if (Visited[Cur] == S_VISITING)
        report("(visiting) ");
      else
        report("(new) ");
      reportNumber("Cur: ", Cur, 10);
    });

    // This shouldn't happen in a tree
    assert(Visited[Cur] != S_VISITED, "should not have visited nodes in stack");
    if (Visited[Cur] == S_NEW) {
      Visited[Cur] = S_VISITING;
      Stack[StackTop++] = Cur;
      assert(StackTop <= G->NumNodes, "stack grew too large");
      for (int I = 0, E = G->SpanningTreeNodes[Cur].NumOutEdges; I < E; ++I) {
        const uint32_t Succ = G->SpanningTreeNodes[Cur].OutEdges[I].Node;
        Stack[StackTop++] = Succ;
        assert(StackTop <= G->NumNodes, "stack grew too large");
      }
      continue;
    }
    Visited[Cur] = S_VISITED;

    // No parent? Reached a tree root, nothing to do.
    if (G->SpanningTreeNodes[Cur].NumInEdges == 0)
      continue;

    assert(G->SpanningTreeNodes[Cur].NumInEdges == 1, "must have 1 parent");
    // NOTE(review): Parent is computed but never used below; looks dead.
    const uint32_t Parent = G->SpanningTreeNodes[Cur].InEdges[0].Node;
    const uint32_t ParentEdge = G->SpanningTreeNodes[Cur].InEdges[0].ID;

    // Establish our node frequency based on outgoing edges, which should all be
    // resolved by now.
    int64_t CurNodeFreq = LeafFrequency[Cur];
    // Not a leaf?
    if (!CurNodeFreq) {
      for (int I = 0, E = G->CFGNodes[Cur].NumOutEdges; I != E; ++I) {
        const uint32_t SuccEdge = G->CFGNodes[Cur].OutEdges[I].ID;
        CurNodeFreq += EdgeFrequency[SuccEdge];
      }
    }
    // Flow equation: parent edge = node frequency minus all other in-edges.
    int64_t ParentEdgeFreq = CurNodeFreq;
    // Calculate parent edge freq.
    for (int I = 0, E = G->CFGNodes[Cur].NumInEdges; I != E; ++I) {
      const uint32_t PredEdge = G->CFGNodes[Cur].InEdges[I].ID;
      ParentEdgeFreq -= EdgeFrequency[PredEdge];
    }
    // Sometimes the conservative CFG that BOLT builds will lead to incorrect
    // flow computation. For example, in a BB that transitively calls the exit
    // syscall, BOLT will add a fall-through successor even though it should not
    // have any successors. So this block execution will likely be wrong. We
    // tolerate this imperfection since this case should be quite infrequent.
    if (ParentEdgeFreq < 0) {
      DEBUG(dumpEdgeFreqs(EdgeFrequency, EdgeDescs, NumEdges));
      DEBUG(report("WARNING: incorrect flow"));
      ParentEdgeFreq = 0;
    }
    DEBUG(reportNumber(" Setting freq for ParentEdge: ", ParentEdge, 10));
    DEBUG(reportNumber(" with ParentEdgeFreq: ", ParentEdgeFreq, 10));
    EdgeFrequency[ParentEdge] = ParentEdgeFreq;
  }

  // Release helpers in LIFO order; EdgeFrequency is returned to the caller.
  Alloc.deallocate(LeafFrequency);
  Alloc.deallocate(Visited);
  Alloc.deallocate(Stack);
  return EdgeFrequency;
}
|
||||
|
||||
// Write to \p FD all of the edge profiles for function \p FuncDesc. Uses
// \p Alloc to allocate helper dynamic structures used to compute profile for
// edges that we do not explicitly instrument. Returns a pointer to the byte
// past this function's serialized description, i.e. the next description.
//
// Serialized layout being walked here:
//   [u32 NumLeafNodes][InstrumentedNode x NumLeafNodes]
//   [u32 NumEdges][EdgeDescription x NumEdges]
uint8_t *writeFunctionProfile(int FD, const InstrumentationInfo &Info,
                              uint8_t *FuncDesc, BumpPtrAllocator &Alloc) {
  uint32_t NumLeafNodes = *reinterpret_cast<uint32_t *>(FuncDesc);
  DEBUG(reportNumber("NumLeafNodes = ", NumLeafNodes, 10));
  InstrumentedNode *LeafNodes =
      reinterpret_cast<InstrumentedNode *>(FuncDesc + 4);

  uint32_t NumEdges = *reinterpret_cast<uint32_t *>(
      FuncDesc + 4 + NumLeafNodes * sizeof(InstrumentedNode));
  DEBUG(reportNumber("NumEdges = ", NumEdges, 10));
  EdgeDescription *EdgeDescs = reinterpret_cast<EdgeDescription *>(
      FuncDesc + 8 + NumLeafNodes * sizeof(InstrumentedNode));

  // Address of the next function description, returned on every exit path.
  uint8_t *next = (FuncDesc + 8 + NumLeafNodes * sizeof(InstrumentedNode) +
                   NumEdges * sizeof(EdgeDescription));

  // Skip funcs we know are cold: if every leaf counter is zero, no edge can
  // have executed either.
#ifndef ENABLE_DEBUG
  uint64_t LeafFreq = 0;
  for (int I = 0; I < NumLeafNodes; ++I) {
    LeafFreq += __bolt_instr_locations[LeafNodes[I].Counter];
  }
  if (LeafFreq == 0)
    return next;
#endif

  Graph *G =
      new (Alloc) Graph(Alloc, EdgeDescs, NumEdges, LeafNodes, NumLeafNodes);
  DEBUG(G->dump());
  uint64_t *Freqs =
      computeEdgeFrequencies(Alloc, G, EdgeDescs, NumEdges, LeafNodes,
                             NumLeafNodes, __bolt_instr_locations);
  if (!Freqs) {
    // Explicit destructor call + deallocate because G came from the bump
    // allocator, not regular operator new.
    G->~Graph();
    Alloc.deallocate(G);
    return next;
  }

  DEBUG(dumpEdgeFreqs(Freqs, EdgeDescs, NumEdges));
  // Emit one fdata line per edge with a non-zero count.
  for (int I = 0; I < NumEdges; ++I) {
    const uint64_t Freq = Freqs[I];
    if (Freq == 0)
      continue;
    const EdgeDescription *Desc = &EdgeDescs[I];
    char LineBuf[BufSize];
    char *Ptr = LineBuf;
    Ptr = serializeLoc(Info, Ptr, Desc->From, BufSize);
    Ptr = serializeLoc(Info, Ptr, Desc->To, BufSize - (Ptr - LineBuf));
    Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 22);
    Ptr = intToStr(Ptr, Freq, 10);
    *Ptr++ = '\n';
    __write(FD, LineBuf, Ptr - LineBuf);
  }

  // Tear down in LIFO order: Freqs was allocated after G.
  Alloc.deallocate(Freqs);
  G->~Graph();
  Alloc.deallocate(G);
  return next;
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
// This is the entry point called at program exit. BOLT patches the executable's
|
||||
// FINI entry in the .dynamic section with the address of this function. Our
|
||||
// goal here is to flush to disk all instrumentation data in memory, using
|
||||
@ -260,25 +852,43 @@ static InstrumentationInfo readDescriptions() {
|
||||
extern "C" void __bolt_instr_data_dump() {
|
||||
const InstrumentationInfo Info = readDescriptions();
|
||||
|
||||
DEBUG(printStats(Info));
|
||||
|
||||
uint64_t FD = __open(__bolt_instr_filename,
|
||||
/*flags=*/0x241 /*O_WRONLY|O_TRUNC|O_CREAT*/,
|
||||
/*mode=*/0666);
|
||||
if (static_cast<int64_t>(FD) < 0) {
|
||||
reportNumber("Assertion error: failed to open profile file for writing. "
|
||||
"Error number: 0x",
|
||||
0 - static_cast<int64_t>(FD), 16);
|
||||
__exit(1);
|
||||
}
|
||||
|
||||
for (int I = 0, E = __bolt_instr_num_locs; I < E; ++I) {
|
||||
char LineBuf[2000];
|
||||
for (int I = 0, E = __bolt_instr_num_calls; I < E; ++I) {
|
||||
char LineBuf[BufSize];
|
||||
char *Ptr = LineBuf;
|
||||
uint32_t HitCount = __bolt_instr_locations[I];
|
||||
|
||||
CallDescription *Desc = &Info.CallDescriptions[I];
|
||||
uint64_t HitCount = __bolt_instr_locations[Desc->Counter];
|
||||
if (!HitCount)
|
||||
continue;
|
||||
|
||||
EdgeDescription *Desc = &Info.Descriptions[I];
|
||||
Ptr = serializeLoc(Info, Ptr, Desc->From);
|
||||
Ptr = serializeLoc(Info, Ptr, Desc->To);
|
||||
Ptr = strCopy(Ptr, "0 ");
|
||||
Ptr = serializeLoc(Info, Ptr, Desc->From, BufSize);
|
||||
Ptr = serializeLoc(Info, Ptr, Desc->To, BufSize - (Ptr - LineBuf));
|
||||
Ptr = strCopy(Ptr, "0 ", BufSize - (Ptr - LineBuf) - 25);
|
||||
Ptr = intToStr(Ptr, HitCount, 10);
|
||||
*Ptr++ = '\n';
|
||||
__write(FD, LineBuf, Ptr - LineBuf);
|
||||
}
|
||||
|
||||
BumpPtrAllocator Alloc;
|
||||
uint8_t *FuncDesc = Info.FuncDescriptions;
|
||||
for (int I = 0, E = __bolt_instr_num_funcs; I < E; ++I) {
|
||||
FuncDesc = writeFunctionProfile(FD, Info, FuncDesc, Alloc);
|
||||
DEBUG(reportNumber("FuncDesc now: ", (uint64_t)FuncDesc, 16));
|
||||
}
|
||||
assert(FuncDesc == (void *)Info.Strings,
|
||||
"FuncDesc ptr must be equal to stringtable");
|
||||
|
||||
__close(FD);
|
||||
__munmap(Info.MMapPtr, Info.MMapSize);
|
||||
__close(Info.FileDesc);
|
||||
|
@ -545,6 +545,7 @@ BinaryContext::getOrCreateJumpTable(BinaryFunction &Function, uint64_t Address,
|
||||
std::pair<uint64_t, const MCSymbol *>
|
||||
BinaryContext::duplicateJumpTable(BinaryFunction &Function, JumpTable *JT,
|
||||
const MCSymbol *OldLabel) {
|
||||
auto L = scopeLock();
|
||||
unsigned Offset = 0;
|
||||
bool Found = false;
|
||||
for (auto Elmt : JT->Labels) {
|
||||
|
@ -390,6 +390,9 @@ public:
|
||||
|
||||
/// A mutex that is used to control parallel accesses to Ctx
|
||||
mutable std::shared_timed_mutex CtxMutex;
|
||||
std::unique_lock<std::shared_timed_mutex> scopeLock() const {
|
||||
return std::unique_lock<std::shared_timed_mutex>(CtxMutex);
|
||||
}
|
||||
|
||||
std::unique_ptr<DWARFContext> DwCtx;
|
||||
|
||||
|
@ -898,7 +898,7 @@ MCSymbol *BinaryFunction::getOrCreateLocalLabel(uint64_t Address,
|
||||
}
|
||||
MCSymbol *Result;
|
||||
{
|
||||
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
|
||||
auto L = BC.scopeLock();
|
||||
Result = BC.Ctx->createTempSymbol();
|
||||
}
|
||||
Labels[Offset] = Result;
|
||||
@ -1767,7 +1767,7 @@ bool BinaryFunction::buildCFG(MCPlusBuilder::AllocatorIdTy AllocatorId) {
|
||||
} else {
|
||||
MCSymbol *Label;
|
||||
{
|
||||
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
|
||||
auto L = BC.scopeLock();
|
||||
Label = BC.Ctx->createTempSymbol("FT", true);
|
||||
}
|
||||
InsertBB = addBasicBlock(
|
||||
@ -3304,12 +3304,12 @@ void BinaryFunction::fixBranches() {
|
||||
if (NextBB && NextBB == TSuccessor) {
|
||||
std::swap(TSuccessor, FSuccessor);
|
||||
{
|
||||
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
|
||||
auto L = BC.scopeLock();
|
||||
MIB->reverseBranchCondition(*CondBranch, TSuccessor->getLabel(), Ctx);
|
||||
}
|
||||
BB->swapConditionalSuccessors();
|
||||
} else {
|
||||
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
|
||||
auto L = BC.scopeLock();
|
||||
MIB->replaceBranchTarget(*CondBranch, TSuccessor->getLabel(), Ctx);
|
||||
}
|
||||
if (TSuccessor == FSuccessor) {
|
||||
@ -3324,7 +3324,7 @@ void BinaryFunction::fixBranches() {
|
||||
BB->isCold() != TSuccessor->isCold()) {
|
||||
std::swap(TSuccessor, FSuccessor);
|
||||
{
|
||||
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
|
||||
auto L = BC.scopeLock();
|
||||
MIB->reverseBranchCondition(*CondBranch, TSuccessor->getLabel(),
|
||||
Ctx);
|
||||
}
|
||||
@ -3675,7 +3675,8 @@ bool BinaryFunction::checkForAmbiguousJumpTables() {
|
||||
return false;
|
||||
}
|
||||
|
||||
void BinaryFunction::disambiguateJumpTables() {
|
||||
void BinaryFunction::disambiguateJumpTables(
|
||||
MCPlusBuilder::AllocatorIdTy AllocId) {
|
||||
assert((opts::JumpTables != JTS_BASIC && isSimple()) || BC.HasRelocations);
|
||||
SmallPtrSet<JumpTable *, 4> JumpTables;
|
||||
for (auto &BB : BasicBlocks) {
|
||||
@ -3744,10 +3745,13 @@ void BinaryFunction::disambiguateJumpTables() {
|
||||
const MCSymbol *NewJTLabel;
|
||||
std::tie(NewJumpTableID, NewJTLabel) =
|
||||
BC.duplicateJumpTable(*this, JT, Target);
|
||||
BC.MIB->replaceMemOperandDisp(*JTLoadInst, NewJTLabel, BC.Ctx.get());
|
||||
{
|
||||
auto L = BC.scopeLock();
|
||||
BC.MIB->replaceMemOperandDisp(*JTLoadInst, NewJTLabel, BC.Ctx.get());
|
||||
}
|
||||
// We use a unique ID with the high bit set as address for this "injected"
|
||||
// jump table (not originally in the input binary).
|
||||
BC.MIB->setJumpTable(Inst, NewJumpTableID, 0);
|
||||
BC.MIB->setJumpTable(Inst, NewJumpTableID, 0, AllocId);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -3773,7 +3777,7 @@ BinaryBasicBlock *BinaryFunction::splitEdge(BinaryBasicBlock *From,
|
||||
// Create intermediate BB
|
||||
MCSymbol *Tmp;
|
||||
{
|
||||
std::unique_lock<std::shared_timed_mutex> WrLock(BC.CtxMutex);
|
||||
auto L = BC.scopeLock();
|
||||
Tmp = BC.Ctx->createTempSymbol("SplitEdge", true);
|
||||
}
|
||||
// Link new BBs to the original input offset of the From BB, so we can map
|
||||
|
@ -1487,7 +1487,7 @@ public:
|
||||
/// by an indirect branch, e.g.: instrumentation or shrink wrapping. However,
|
||||
/// this is only possible if we are not updating jump tables in place, but are
|
||||
/// writing it to a new location (moving them).
|
||||
void disambiguateJumpTables();
|
||||
void disambiguateJumpTables(MCPlusBuilder::AllocatorIdTy AllocId);
|
||||
|
||||
/// Change \p OrigDest to \p NewDest in the jump table used at the end of
|
||||
/// \p BB. Returns false if \p OrigDest couldn't be find as a valid target
|
||||
|
@ -10,6 +10,7 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "Instrumentation.h"
|
||||
#include "ParallelUtilities.h"
|
||||
#include "Passes/DataflowInfoManager.h"
|
||||
#include "llvm/Support/Options.h"
|
||||
|
||||
@ -35,6 +36,13 @@ cl::opt<bool> InstrumentHotOnly(
|
||||
cl::init(false),
|
||||
cl::Optional,
|
||||
cl::cat(BoltCategory));
|
||||
|
||||
cl::opt<bool> InstrumentCalls(
|
||||
"instrument-calls",
|
||||
cl::desc("record profile for inter-function control flow activity"),
|
||||
cl::init(false),
|
||||
cl::Optional,
|
||||
cl::cat(BoltCategory));
|
||||
}
|
||||
|
||||
namespace llvm {
|
||||
@ -51,47 +59,105 @@ uint32_t Instrumentation::getFunctionNameIndex(const BinaryFunction &Function) {
|
||||
return Idx;
|
||||
}
|
||||
|
||||
Instrumentation::CounterDescription Instrumentation::createDescription(
|
||||
void Instrumentation::createCallDescription(
|
||||
const BinaryFunction &FromFunction, uint32_t From,
|
||||
const BinaryFunction &ToFunction, uint32_t To) {
|
||||
CounterDescription Res;
|
||||
Res.FromFuncStringIdx = getFunctionNameIndex(FromFunction);
|
||||
Res.FromOffset = From;
|
||||
Res.ToFuncStringIdx = getFunctionNameIndex(ToFunction);
|
||||
Res.ToOffset = To;
|
||||
return Res;
|
||||
CallDescription CD;
|
||||
CD.FromLoc.FuncString = getFunctionNameIndex(FromFunction);
|
||||
CD.FromLoc.Offset = From;
|
||||
CD.ToLoc.FuncString = getFunctionNameIndex(ToFunction);
|
||||
CD.ToLoc.Offset = To;
|
||||
CD.Counter = Counters.size();
|
||||
CallDescriptions.emplace_back(CD);
|
||||
}
|
||||
|
||||
std::vector<MCInst> Instrumentation::createInstrumentationSnippet(
|
||||
BinaryFunction &FromFunction, uint32_t FromOffset, BinaryFunction &ToFunc,
|
||||
uint32_t ToOffset) {
|
||||
Descriptions.emplace_back(
|
||||
createDescription(FromFunction, FromOffset, ToFunc, ToOffset));
|
||||
bool Instrumentation::createEdgeDescription(
|
||||
FunctionDescription &FuncDesc,
|
||||
const BinaryFunction &FromFunction, uint32_t From,
|
||||
uint32_t FromNodeID,
|
||||
const BinaryFunction &ToFunction, uint32_t To,
|
||||
uint32_t ToNodeID, bool Instrumented) {
|
||||
EdgeDescription ED;
|
||||
auto Result = FuncDesc.EdgesSet.insert(std::make_pair(FromNodeID, ToNodeID));
|
||||
// Avoid creating duplicated edge descriptions. This happens in CFGs where a
|
||||
// block jumps to its fall-through.
|
||||
if (Result.second == false)
|
||||
return false;
|
||||
ED.FromLoc.FuncString = getFunctionNameIndex(FromFunction);
|
||||
ED.FromLoc.Offset = From;
|
||||
ED.FromNode = FromNodeID;
|
||||
ED.ToLoc.FuncString = getFunctionNameIndex(ToFunction);
|
||||
ED.ToLoc.Offset = To;
|
||||
ED.ToNode = ToNodeID;
|
||||
ED.Counter = Instrumented ? Counters.size() : 0xffffffff;
|
||||
FuncDesc.Edges.emplace_back(ED);
|
||||
return true;
|
||||
}
|
||||
|
||||
BinaryContext &BC = FromFunction.getBinaryContext();
|
||||
MCSymbol *Label =
|
||||
BC.Ctx->createTempSymbol("InstrEntry", true);
|
||||
Labels.emplace_back(Label);
|
||||
void Instrumentation::createExitNodeDescription(FunctionDescription &FuncDesc,
|
||||
uint32_t Node) {
|
||||
InstrumentedNode IN;
|
||||
IN.Node = Node;
|
||||
IN.Counter = Counters.size();
|
||||
FuncDesc.ExitNodes.emplace_back(IN);
|
||||
}
|
||||
|
||||
std::vector<MCInst>
|
||||
Instrumentation::createInstrumentationSnippet(BinaryContext &BC, bool IsLeaf) {
|
||||
auto L = BC.scopeLock();
|
||||
MCSymbol *Label;
|
||||
Label = BC.Ctx->createTempSymbol("InstrEntry", true);
|
||||
Counters.emplace_back(Label);
|
||||
std::vector<MCInst> CounterInstrs(5);
|
||||
// Don't clobber application red zone (ABI dependent)
|
||||
BC.MIB->createStackPointerIncrement(CounterInstrs[0], 128,
|
||||
/*NoFlagsClobber=*/true);
|
||||
if (IsLeaf)
|
||||
BC.MIB->createStackPointerIncrement(CounterInstrs[0], 128,
|
||||
/*NoFlagsClobber=*/true);
|
||||
BC.MIB->createPushFlags(CounterInstrs[1], 2);
|
||||
BC.MIB->createIncMemory(CounterInstrs[2], Label, &*BC.Ctx);
|
||||
BC.MIB->createPopFlags(CounterInstrs[3], 2);
|
||||
BC.MIB->createStackPointerDecrement(CounterInstrs[4], 128,
|
||||
/*NoFlagsClobber=*/true);
|
||||
if (IsLeaf)
|
||||
BC.MIB->createStackPointerDecrement(CounterInstrs[4], 128,
|
||||
/*NoFlagsClobber=*/true);
|
||||
return CounterInstrs;
|
||||
}
|
||||
|
||||
bool Instrumentation::instrumentOneTarget(BinaryBasicBlock::iterator &Iter,
|
||||
BinaryFunction &FromFunction,
|
||||
BinaryBasicBlock &FromBB,
|
||||
uint32_t From, BinaryFunction &ToFunc,
|
||||
BinaryBasicBlock *TargetBB,
|
||||
uint32_t ToOffset) {
|
||||
void Instrumentation::instrumentExitNode(BinaryContext &BC,
|
||||
BinaryBasicBlock &BB,
|
||||
BinaryBasicBlock::iterator Iter,
|
||||
bool IsLeaf,
|
||||
FunctionDescription &FuncDesc,
|
||||
uint32_t Node) {
|
||||
createExitNodeDescription(FuncDesc, Node);
|
||||
std::vector<MCInst> CounterInstrs = createInstrumentationSnippet(BC, IsLeaf);
|
||||
|
||||
for (auto &NewInst : CounterInstrs) {
|
||||
Iter = BB.insertInstruction(Iter, NewInst);
|
||||
++Iter;
|
||||
}
|
||||
}
|
||||
|
||||
bool Instrumentation::instrumentOneTarget(
|
||||
SplitWorklistTy &SplitWorklist, SplitInstrsTy &SplitInstrs,
|
||||
BinaryBasicBlock::iterator &Iter, BinaryFunction &FromFunction,
|
||||
BinaryBasicBlock &FromBB, uint32_t From, BinaryFunction &ToFunc,
|
||||
BinaryBasicBlock *TargetBB, uint32_t ToOffset, bool IsLeaf,
|
||||
FunctionDescription *FuncDesc, uint32_t FromNodeID, uint32_t ToNodeID) {
|
||||
{
|
||||
auto L = FromFunction.getBinaryContext().scopeLock();
|
||||
bool Created{true};
|
||||
if (!TargetBB)
|
||||
createCallDescription(FromFunction, From, ToFunc, ToOffset);
|
||||
else
|
||||
Created = createEdgeDescription(*FuncDesc, FromFunction, From, FromNodeID,
|
||||
ToFunc, ToOffset, ToNodeID,
|
||||
/*Instrumented=*/true);
|
||||
if (!Created)
|
||||
return false;
|
||||
}
|
||||
|
||||
std::vector<MCInst> CounterInstrs =
|
||||
createInstrumentationSnippet(FromFunction, From, ToFunc, ToOffset);
|
||||
createInstrumentationSnippet(FromFunction.getBinaryContext(), IsLeaf);
|
||||
|
||||
BinaryContext &BC = FromFunction.getBinaryContext();
|
||||
const MCInst &Inst = *Iter;
|
||||
@ -103,7 +169,7 @@ bool Instrumentation::instrumentOneTarget(BinaryBasicBlock::iterator &Iter,
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!TargetBB)
|
||||
if (!TargetBB || !FuncDesc)
|
||||
return false;
|
||||
|
||||
// Indirect branch, conditional branches or fall-throughs
|
||||
@ -130,6 +196,199 @@ bool Instrumentation::instrumentOneTarget(BinaryBasicBlock::iterator &Iter,
|
||||
return true;
|
||||
}
|
||||
|
||||
void Instrumentation::instrumentFunction(BinaryContext &BC,
|
||||
BinaryFunction &Function,
|
||||
MCPlusBuilder::AllocatorIdTy AllocId) {
|
||||
SplitWorklistTy SplitWorklist;
|
||||
SplitInstrsTy SplitInstrs;
|
||||
|
||||
FunctionDescription *FuncDesc = nullptr;
|
||||
{
|
||||
std::unique_lock<std::shared_timed_mutex> L(FDMutex);
|
||||
FunctionDescriptions.emplace_back();
|
||||
FuncDesc = &FunctionDescriptions.back();
|
||||
}
|
||||
|
||||
Function.disambiguateJumpTables(AllocId);
|
||||
|
||||
std::unordered_map<const BinaryBasicBlock *, uint32_t> BBToID;
|
||||
uint32_t Id = 0;
|
||||
for (auto BBI = Function.begin(); BBI != Function.end(); ++BBI) {
|
||||
BBToID[&*BBI] = Id++;
|
||||
}
|
||||
std::unordered_set<const BinaryBasicBlock *> VisitedSet;
|
||||
// DFS to establish edges we will use for a spanning tree. Edges in the
|
||||
// spanning tree can be instrumentation-free since their count can be
|
||||
// inferred by solving flow equations on a bottom-up traversal of the tree.
|
||||
// Exit basic blocks are always instrumented so we start the traversal with
|
||||
// a minimum number of defined variables to make the equation solvable.
|
||||
std::stack<std::pair<const BinaryBasicBlock *, BinaryBasicBlock *>> Stack;
|
||||
std::unordered_map<const BinaryBasicBlock *,
|
||||
std::set<const BinaryBasicBlock *>>
|
||||
STOutSet;
|
||||
for (auto BBI = Function.layout_rbegin(); BBI != Function.layout_rend();
|
||||
++BBI) {
|
||||
if ((*BBI)->isEntryPoint())
|
||||
Stack.push(std::make_pair(nullptr, *BBI));
|
||||
}
|
||||
|
||||
// Modified version of BinaryFunction::dfs() to build a spanning tree
|
||||
while (!Stack.empty()) {
|
||||
BinaryBasicBlock *BB;
|
||||
const BinaryBasicBlock *Pred;
|
||||
std::tie(Pred, BB) = Stack.top();
|
||||
Stack.pop();
|
||||
if (VisitedSet.find(BB) != VisitedSet.end())
|
||||
continue;
|
||||
|
||||
VisitedSet.insert(BB);
|
||||
if (Pred)
|
||||
STOutSet[Pred].insert(BB);
|
||||
|
||||
for (auto *SuccBB : BB->landing_pads())
|
||||
Stack.push(std::make_pair(BB, SuccBB));
|
||||
|
||||
for (auto *SuccBB : BB->successors())
|
||||
Stack.push(std::make_pair(BB, SuccBB));
|
||||
}
|
||||
|
||||
// Determine whether this is a leaf function, which needs special
|
||||
// instructions to protect the red zone
|
||||
bool IsLeafFunction{true};
|
||||
for (auto BBI = Function.begin(), BBE = Function.end(); BBI != BBE; ++BBI) {
|
||||
for (auto I = BBI->begin(), E = BBI->end(); I != E; ++I) {
|
||||
if (BC.MIB->isCall(*I)) {
|
||||
IsLeafFunction = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!IsLeafFunction)
|
||||
break;
|
||||
}
|
||||
|
||||
for (auto BBI = Function.begin(), BBE = Function.end(); BBI != BBE; ++BBI) {
|
||||
auto &BB{*BBI};
|
||||
bool HasUnconditionalBranch{false};
|
||||
bool HasJumpTable{false};
|
||||
|
||||
for (auto I = BB.begin(); I != BB.end(); ++I) {
|
||||
const auto &Inst = *I;
|
||||
if (!BC.MIB->hasAnnotation(Inst, "Offset"))
|
||||
continue;
|
||||
|
||||
const bool IsJumpTable = Function.getJumpTable(Inst);
|
||||
if (IsJumpTable)
|
||||
HasJumpTable = true;
|
||||
else if (BC.MIB->isUnconditionalBranch(Inst))
|
||||
HasUnconditionalBranch = true;
|
||||
else if ((!BC.MIB->isCall(Inst) && !BC.MIB->isConditionalBranch(Inst)) ||
|
||||
BC.MIB->isUnsupportedBranch(Inst.getOpcode()))
|
||||
continue;
|
||||
|
||||
uint32_t FromOffset = BC.MIB->getAnnotationAs<uint32_t>(Inst, "Offset");
|
||||
const MCSymbol *Target = BC.MIB->getTargetSymbol(Inst);
|
||||
BinaryBasicBlock *TargetBB = Function.getBasicBlockForLabel(Target);
|
||||
uint32_t ToOffset = TargetBB ? TargetBB->getInputOffset() : 0;
|
||||
BinaryFunction *TargetFunc =
|
||||
TargetBB ? &Function : BC.getFunctionForSymbol(Target);
|
||||
// Should be null for indirect branches/calls
|
||||
if (TargetFunc && !TargetBB) {
|
||||
if (opts::InstrumentCalls)
|
||||
instrumentOneTarget(SplitWorklist, SplitInstrs, I, Function, BB,
|
||||
FromOffset, *TargetFunc, TargetBB, ToOffset,
|
||||
IsLeafFunction);
|
||||
continue;
|
||||
}
|
||||
if (TargetFunc) {
|
||||
// Do not instrument edges in the spanning tree
|
||||
if (STOutSet[&BB].find(TargetBB) != STOutSet[&BB].end()) {
|
||||
auto L = BC.scopeLock();
|
||||
createEdgeDescription(*FuncDesc, Function, FromOffset, BBToID[&BB],
|
||||
Function, ToOffset, BBToID[TargetBB],
|
||||
/*Instrumented=*/false);
|
||||
continue;
|
||||
}
|
||||
instrumentOneTarget(SplitWorklist, SplitInstrs, I, Function, BB,
|
||||
FromOffset, *TargetFunc, TargetBB, ToOffset,
|
||||
IsLeafFunction, FuncDesc, BBToID[&BB],
|
||||
BBToID[TargetBB]);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (IsJumpTable) {
|
||||
for (auto &Succ : BB.successors()) {
|
||||
// Do not instrument edges in the spanning tree
|
||||
if (STOutSet[&BB].find(&*Succ) != STOutSet[&BB].end()) {
|
||||
auto L = BC.scopeLock();
|
||||
createEdgeDescription(*FuncDesc, Function, FromOffset, BBToID[&BB],
|
||||
Function, Succ->getInputOffset(),
|
||||
BBToID[&*Succ], /*Instrumented=*/false);
|
||||
continue;
|
||||
}
|
||||
instrumentOneTarget(SplitWorklist, SplitInstrs, I, Function, BB,
|
||||
FromOffset, Function, &*Succ,
|
||||
Succ->getInputOffset(), IsLeafFunction, FuncDesc,
|
||||
BBToID[&BB], BBToID[&*Succ]);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
// FIXME: handle indirect calls
|
||||
} // End of instructions loop
|
||||
|
||||
// Instrument fallthroughs (when the direct jump instruction is missing)
|
||||
if (!HasUnconditionalBranch && !HasJumpTable && BB.succ_size() > 0 &&
|
||||
BB.size() > 0) {
|
||||
auto *FTBB = BB.getFallthrough();
|
||||
assert(FTBB && "expected valid fall-through basic block");
|
||||
auto I = BB.begin();
|
||||
auto LastInstr = BB.end();
|
||||
--LastInstr;
|
||||
while (LastInstr != I && BC.MIB->isPseudo(*LastInstr))
|
||||
--LastInstr;
|
||||
uint32_t FromOffset = 0;
|
||||
// The last instruction in the BB should have an annotation, except
|
||||
// if it was branching to the end of the function as a result of
|
||||
// __builtin_unreachable(), in which case it was deleted by fixBranches.
|
||||
// Ignore this case. FIXME: force fixBranches() to preserve the offset.
|
||||
if (!BC.MIB->hasAnnotation(*LastInstr, "Offset"))
|
||||
continue;
|
||||
FromOffset = BC.MIB->getAnnotationAs<uint32_t>(*LastInstr, "Offset");
|
||||
|
||||
// Do not instrument edges in the spanning tree
|
||||
if (STOutSet[&BB].find(FTBB) != STOutSet[&BB].end()) {
|
||||
auto L = BC.scopeLock();
|
||||
createEdgeDescription(*FuncDesc, Function, FromOffset, BBToID[&BB],
|
||||
Function, FTBB->getInputOffset(), BBToID[FTBB],
|
||||
/*Instrumented=*/false);
|
||||
continue;
|
||||
}
|
||||
instrumentOneTarget(SplitWorklist, SplitInstrs, I, Function, BB,
|
||||
FromOffset, Function, FTBB, FTBB->getInputOffset(),
|
||||
IsLeafFunction, FuncDesc, BBToID[&BB], BBToID[FTBB]);
|
||||
}
|
||||
} // End of BBs loop
|
||||
|
||||
// Instrument spanning tree leaves
|
||||
for (auto BBI = Function.begin(), BBE = Function.end(); BBI != BBE; ++BBI) {
|
||||
auto &BB{*BBI};
|
||||
if (STOutSet[&BB].size() == 0 && BB.size() > 0)
|
||||
instrumentExitNode(BC, BB, BB.begin(), IsLeafFunction, *FuncDesc,
|
||||
BBToID[&BB]);
|
||||
}
|
||||
|
||||
// Consume list of critical edges: split them and add instrumentation to the
|
||||
// newly created BBs
|
||||
auto Iter = SplitInstrs.begin();
|
||||
for (auto &BBPair : SplitWorklist) {
|
||||
auto *NewBB = Function.splitEdge(BBPair.first, BBPair.second);
|
||||
NewBB->addInstructions(Iter->begin(), Iter->end());
|
||||
++Iter;
|
||||
}
|
||||
|
||||
// Unused now
|
||||
FuncDesc->EdgesSet.clear();
|
||||
}
|
||||
|
||||
void Instrumentation::runOnFunctions(BinaryContext &BC) {
|
||||
if (!BC.isX86())
|
||||
return;
|
||||
@ -146,100 +405,28 @@ void Instrumentation::runOnFunctions(BinaryContext &BC) {
|
||||
/*Alignment=*/1,
|
||||
/*IsReadOnly=*/true, ELF::SHT_NOTE);
|
||||
|
||||
uint64_t InstrumentationSites{0ULL};
|
||||
uint64_t InstrumentationSitesSavingFlags{0ULL};
|
||||
for (auto &BFI : BC.getBinaryFunctions()) {
|
||||
BinaryFunction &Function = BFI.second;
|
||||
if (!Function.isSimple() || !opts::shouldProcess(Function)
|
||||
|| (opts::InstrumentHotOnly && !Function.getKnownExecutionCount()))
|
||||
continue;
|
||||
Function.disambiguateJumpTables();
|
||||
SplitWorklist.clear();
|
||||
SplitInstrs.clear();
|
||||
ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
|
||||
return (!BF.isSimple() || !opts::shouldProcess(BF) ||
|
||||
(opts::InstrumentHotOnly && !BF.getKnownExecutionCount()));
|
||||
};
|
||||
|
||||
for (auto BBI = Function.begin(); BBI != Function.end(); ++BBI) {
|
||||
auto &BB{*BBI};
|
||||
bool HasUnconditionalBranch{false};
|
||||
bool HasJumpTable{false};
|
||||
ParallelUtilities::WorkFuncWithAllocTy WorkFun =
|
||||
[&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) {
|
||||
instrumentFunction(BC, BF, AllocatorId);
|
||||
};
|
||||
|
||||
for (auto I = BB.begin(); I != BB.end(); ++I) {
|
||||
const auto &Inst = *I;
|
||||
if (!BC.MIB->hasAnnotation(Inst, "Offset"))
|
||||
continue;
|
||||
ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
|
||||
BC, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC, WorkFun,
|
||||
SkipPredicate, "instrumentation", /* ForceSequential=*/true);
|
||||
}
|
||||
|
||||
const bool IsJumpTable = Function.getJumpTable(Inst);
|
||||
if (IsJumpTable)
|
||||
HasJumpTable = true;
|
||||
else if (BC.MIB->isUnconditionalBranch(Inst))
|
||||
HasUnconditionalBranch = true;
|
||||
else if ((!BC.MIB->isCall(Inst) &&
|
||||
!BC.MIB->isConditionalBranch(Inst)) ||
|
||||
BC.MIB->isUnsupportedBranch(Inst.getOpcode()))
|
||||
continue;
|
||||
|
||||
uint32_t FromOffset = BC.MIB->getAnnotationAs<uint32_t>(Inst, "Offset");
|
||||
const MCSymbol *Target = BC.MIB->getTargetSymbol(Inst);
|
||||
BinaryBasicBlock *TargetBB = Function.getBasicBlockForLabel(Target);
|
||||
uint32_t ToOffset = TargetBB ? TargetBB->getInputOffset() : 0;
|
||||
BinaryFunction *TargetFunc =
|
||||
TargetBB ? &Function : BC.getFunctionForSymbol(Target);
|
||||
// Should be null for indirect branches/calls
|
||||
if (TargetFunc) {
|
||||
if (instrumentOneTarget(I, Function, BB, FromOffset, *TargetFunc,
|
||||
TargetBB, ToOffset))
|
||||
++InstrumentationSites;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (IsJumpTable) {
|
||||
for (auto &Succ : BB.successors()) {
|
||||
if (instrumentOneTarget(I, Function, BB, FromOffset, Function,
|
||||
&*Succ, Succ->getInputOffset()))
|
||||
++InstrumentationSites;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// FIXME: handle indirect calls
|
||||
} // End of instructions loop
|
||||
|
||||
// Instrument fallthroughs (when the direct jump instruction is missing)
|
||||
if (!HasUnconditionalBranch && !HasJumpTable && BB.succ_size() > 0 &&
|
||||
BB.size() > 0) {
|
||||
auto *FTBB = BB.getFallthrough();
|
||||
assert(FTBB && "expected valid fall-through basic block");
|
||||
auto I = BB.begin();
|
||||
auto LastInstr = BB.end();
|
||||
--LastInstr;
|
||||
while (LastInstr != I && BC.MIB->isPseudo(*LastInstr))
|
||||
--LastInstr;
|
||||
uint32_t FromOffset = 0;
|
||||
// The last instruction in the BB should have an annotation, except
|
||||
// if it was branching to the end of the function as a result of
|
||||
// __builtin_unreachable(), in which case it was deleted by fixBranches.
|
||||
// Ignore this case. FIXME: force fixBranches() to preserve the offset.
|
||||
if (!BC.MIB->hasAnnotation(*LastInstr, "Offset"))
|
||||
continue;
|
||||
|
||||
FromOffset = BC.MIB->getAnnotationAs<uint32_t>(*LastInstr, "Offset");
|
||||
if (instrumentOneTarget(I, Function, BB, FromOffset, Function, FTBB,
|
||||
FTBB->getInputOffset()))
|
||||
++InstrumentationSites;
|
||||
}
|
||||
} // End of BBs loop
|
||||
|
||||
// Consume list of critical edges: split them and add instrumentation to the
|
||||
// newly created BBs
|
||||
auto Iter = SplitInstrs.begin();
|
||||
for (auto &BBPair : SplitWorklist) {
|
||||
auto *NewBB = Function.splitEdge(BBPair.first, BBPair.second);
|
||||
NewBB->addInstructions(Iter->begin(), Iter->end());
|
||||
++Iter;
|
||||
}
|
||||
uint32_t Instrumentation::getFDSize() const {
|
||||
uint32_t FuncDescSize = 0;
|
||||
for (const auto &Func : FunctionDescriptions) {
|
||||
FuncDescSize += 8 + Func.Edges.size() * sizeof(EdgeDescription) +
|
||||
Func.ExitNodes.size() * sizeof(InstrumentedNode);
|
||||
}
|
||||
|
||||
outs() << "BOLT-INSTRUMENTER: Instrumented " << InstrumentationSites
|
||||
<< " sites, " << InstrumentationSitesSavingFlags << " saving flags.\n";
|
||||
return FuncDescSize;
|
||||
}
|
||||
|
||||
void Instrumentation::emitTablesAsELFNote(BinaryContext &BC) {
|
||||
@ -247,12 +434,36 @@ void Instrumentation::emitTablesAsELFNote(BinaryContext &BC) {
|
||||
raw_string_ostream OS(TablesStr);
|
||||
|
||||
// Start of the vector with descriptions (one CounterDescription for each
|
||||
// counter), vector size is Labels.size() CounterDescription-sized elmts
|
||||
for (const auto &Desc : Descriptions) {
|
||||
OS.write(reinterpret_cast<const char *>(&Desc.FromFuncStringIdx), 4);
|
||||
OS.write(reinterpret_cast<const char *>(&Desc.FromOffset), 4);
|
||||
OS.write(reinterpret_cast<const char *>(&Desc.ToFuncStringIdx), 4);
|
||||
OS.write(reinterpret_cast<const char *>(&Desc.ToOffset), 4);
|
||||
// counter), vector size is Counters.size() CounterDescription-sized elmts
|
||||
const auto CDSize = CallDescriptions.size() * sizeof(CallDescription);
|
||||
OS.write(reinterpret_cast<const char *>(&CDSize), 4);
|
||||
for (const auto &Desc : CallDescriptions) {
|
||||
OS.write(reinterpret_cast<const char *>(&Desc.FromLoc.FuncString), 4);
|
||||
OS.write(reinterpret_cast<const char *>(&Desc.FromLoc.Offset), 4);
|
||||
OS.write(reinterpret_cast<const char *>(&Desc.ToLoc.FuncString), 4);
|
||||
OS.write(reinterpret_cast<const char *>(&Desc.ToLoc.Offset), 4);
|
||||
OS.write(reinterpret_cast<const char *>(&Desc.Counter), 4);
|
||||
}
|
||||
const auto FDSize = getFDSize();
|
||||
OS.write(reinterpret_cast<const char *>(&FDSize), 4);
|
||||
for (const auto &Desc : FunctionDescriptions) {
|
||||
const auto ExitsNum = Desc.ExitNodes.size();
|
||||
OS.write(reinterpret_cast<const char *>(&ExitsNum), 4);
|
||||
for (const auto &ExitNode : Desc.ExitNodes) {
|
||||
OS.write(reinterpret_cast<const char *>(&ExitNode.Node), 4);
|
||||
OS.write(reinterpret_cast<const char *>(&ExitNode.Counter), 4);
|
||||
}
|
||||
const auto EdgesNum = Desc.Edges.size();
|
||||
OS.write(reinterpret_cast<const char *>(&EdgesNum), 4);
|
||||
for (const auto &Edge : Desc.Edges) {
|
||||
OS.write(reinterpret_cast<const char *>(&Edge.FromLoc.FuncString), 4);
|
||||
OS.write(reinterpret_cast<const char *>(&Edge.FromLoc.Offset), 4);
|
||||
OS.write(reinterpret_cast<const char *>(&Edge.FromNode), 4);
|
||||
OS.write(reinterpret_cast<const char *>(&Edge.ToLoc.FuncString), 4);
|
||||
OS.write(reinterpret_cast<const char *>(&Edge.ToLoc.Offset), 4);
|
||||
OS.write(reinterpret_cast<const char *>(&Edge.ToNode), 4);
|
||||
OS.write(reinterpret_cast<const char *>(&Edge.Counter), 4);
|
||||
}
|
||||
}
|
||||
// Our string table lives immediately after descriptions vector
|
||||
OS << StringTable;
|
||||
@ -278,9 +489,10 @@ void Instrumentation::emit(BinaryContext &BC, MCStreamer &Streamer) {
|
||||
// All of the following symbols will be exported as globals to be used by the
|
||||
// instrumentation runtime library to dump the instrumentation data to disk.
|
||||
// Label marking start of the memory region containing instrumentation
|
||||
// counters, total vector size is Labels.size() 8-byte counters
|
||||
// counters, total vector size is Counters.size() 8-byte counters
|
||||
MCSymbol *Locs = BC.Ctx->getOrCreateSymbol("__bolt_instr_locations");
|
||||
MCSymbol *NumLocs = BC.Ctx->getOrCreateSymbol("__bolt_instr_num_locs");
|
||||
MCSymbol *NumCalls = BC.Ctx->getOrCreateSymbol("__bolt_instr_num_calls");
|
||||
MCSymbol *NumFuncs = BC.Ctx->getOrCreateSymbol("__bolt_instr_num_funcs");
|
||||
/// File name where profile is going to written to after target binary
|
||||
/// finishes a run
|
||||
MCSymbol *FilenameSym = BC.Ctx->getOrCreateSymbol("__bolt_instr_filename");
|
||||
@ -289,23 +501,36 @@ void Instrumentation::emit(BinaryContext &BC, MCStreamer &Streamer) {
|
||||
Streamer.EmitLabel(Locs);
|
||||
Streamer.EmitSymbolAttribute(Locs,
|
||||
MCSymbolAttr::MCSA_Global);
|
||||
for (const auto &Label : Labels) {
|
||||
for (const auto &Label : Counters) {
|
||||
Streamer.EmitLabel(Label);
|
||||
Streamer.emitFill(8, 0);
|
||||
}
|
||||
Streamer.EmitLabel(NumLocs);
|
||||
Streamer.EmitSymbolAttribute(NumLocs,
|
||||
Streamer.EmitLabel(NumCalls);
|
||||
Streamer.EmitSymbolAttribute(NumCalls,
|
||||
MCSymbolAttr::MCSA_Global);
|
||||
Streamer.EmitIntValue(Labels.size(), /*Size=*/4);
|
||||
Streamer.EmitIntValue(CallDescriptions.size(), /*Size=*/4);
|
||||
Streamer.EmitLabel(NumFuncs);
|
||||
Streamer.EmitSymbolAttribute(NumFuncs,
|
||||
MCSymbolAttr::MCSA_Global);
|
||||
Streamer.EmitIntValue(FunctionDescriptions.size(), /*Size=*/4);
|
||||
Streamer.EmitLabel(FilenameSym);
|
||||
Streamer.EmitBytes(opts::InstrumentationFilename);
|
||||
Streamer.emitFill(1, 0);
|
||||
|
||||
uint32_t FuncDescSize = getFDSize();
|
||||
outs() << "BOLT-INSTRUMENTER: Number of call descriptors: "
|
||||
<< CallDescriptions.size() << "\n";
|
||||
outs() << "BOLT-INSTRUMENTER: Number of function descriptors: "
|
||||
<< FunctionDescriptions.size() << "\n";
|
||||
outs() << "BOLT-INSTRUMENTER: Number of counters: " << Counters.size()
|
||||
<< "\n";
|
||||
outs() << "BOLT-INSTRUMENTER: Total size of counters: "
|
||||
<< (Labels.size() * 8) << " bytes (static alloc memory)\n";
|
||||
<< (Counters.size() * 8) << " bytes (static alloc memory)\n";
|
||||
outs() << "BOLT-INSTRUMENTER: Total size of string table emitted: "
|
||||
<< StringTable.size() << " bytes in file\n";
|
||||
outs() << "BOLT-INSTRUMENTER: Total size of descriptors: "
|
||||
<< (Labels.size() * 16) << " bytes in file\n";
|
||||
<< (FuncDescSize + CallDescriptions.size() * sizeof(CallDescription))
|
||||
<< " bytes in file\n";
|
||||
outs() << "BOLT-INSTRUMENTER: Profile will be saved to file "
|
||||
<< opts::InstrumentationFilename << "\n";
|
||||
}
|
||||
|
@ -54,14 +54,42 @@ public:
|
||||
void emit(BinaryContext &BC, MCStreamer &Streamer);
|
||||
|
||||
private:
|
||||
// Instrumented branch location information
|
||||
struct CounterDescription {
|
||||
uint32_t FromFuncStringIdx;
|
||||
uint32_t FromOffset;
|
||||
uint32_t ToFuncStringIdx;
|
||||
uint32_t ToOffset;
|
||||
// Location information -- this is a location in the program binary
|
||||
struct LocDescription {
|
||||
uint32_t FuncString;
|
||||
uint32_t Offset;
|
||||
};
|
||||
|
||||
// Inter-function control flow transfer instrumentation
|
||||
struct CallDescription {
|
||||
LocDescription FromLoc;
|
||||
LocDescription ToLoc;
|
||||
uint32_t Counter;
|
||||
};
|
||||
|
||||
// Intra-function control flow transfer instrumentation
|
||||
struct EdgeDescription {
|
||||
LocDescription FromLoc;
|
||||
uint32_t FromNode;
|
||||
LocDescription ToLoc;
|
||||
uint32_t ToNode;
|
||||
uint32_t Counter;
|
||||
};
|
||||
|
||||
struct InstrumentedNode {
|
||||
uint32_t Node;
|
||||
uint32_t Counter;
|
||||
};
|
||||
|
||||
struct FunctionDescription {
|
||||
std::vector<InstrumentedNode> ExitNodes;
|
||||
std::vector<EdgeDescription> Edges;
|
||||
DenseSet<std::pair<uint32_t, uint32_t>> EdgesSet;
|
||||
};
|
||||
|
||||
void instrumentFunction(BinaryContext &BC, BinaryFunction &Function,
|
||||
MCPlusBuilder::AllocatorIdTy = 0);
|
||||
|
||||
/// Retrieve the string table index for the name of \p Function. We encode
|
||||
/// instrumented locations descriptions with the aid of a string table to
|
||||
/// manage memory of the instrumentation runtime in a more efficient way.
|
||||
@ -73,53 +101,69 @@ private:
|
||||
/// branch source location in terms of function name plus offset, as well as
|
||||
/// branch destination (also name + offset). This will be encoded in the
|
||||
/// binary as static data and function name strings will reference a strtab.
|
||||
CounterDescription createDescription(const BinaryFunction &FromFunction,
|
||||
uint32_t From,
|
||||
const BinaryFunction &ToFunction,
|
||||
uint32_t To);
|
||||
|
||||
void createCallDescription(const BinaryFunction &FromFunction, uint32_t From,
|
||||
const BinaryFunction &ToFunction, uint32_t To);
|
||||
bool createEdgeDescription(FunctionDescription &FuncDesc,
|
||||
const BinaryFunction &FromFunction, uint32_t From,
|
||||
uint32_t FromNodeID,
|
||||
const BinaryFunction &ToFunction, uint32_t To,
|
||||
uint32_t ToNodeID, bool Instrumented);
|
||||
void createExitNodeDescription(FunctionDescription &FuncDesc, uint32_t Node);
|
||||
|
||||
/// Create the sequence of instructions to instrument a branch happening
|
||||
/// at \p FromFunction + \p FromOffset to \p ToFunc + \p ToOffset
|
||||
std::vector<MCInst> createInstrumentationSnippet(BinaryFunction &FromFunction,
|
||||
uint32_t FromOffset,
|
||||
BinaryFunction &ToFunc,
|
||||
uint32_t ToOffset);
|
||||
std::vector<MCInst> createInstrumentationSnippet(BinaryContext &BC,
|
||||
bool IsLeaf);
|
||||
|
||||
// Critical edges worklist
|
||||
// This worklist keeps track of CFG edges <From-To> that needs to be split.
|
||||
// This task is deferred until we finish processing all BBs because we can't
|
||||
// modify the CFG while iterating over it. For each edge, \p SplitInstrsTy
|
||||
// stores the list of instrumentation instructions as a vector of MCInsts.
|
||||
// instrumentOneTarget() populates this, runOnFunctions() consumes.
|
||||
using SplitWorklistTy =
|
||||
std::vector<std::pair<BinaryBasicBlock *, BinaryBasicBlock *>>;
|
||||
using SplitInstrsTy = std::vector<std::vector<MCInst>>;
|
||||
|
||||
/// Instrument the branch in \p Iter located at \p FromFunction + \p From,
|
||||
/// basic block \p FromBB. The destination of the branch is \p ToFunc +
|
||||
/// \p ToOffset. \p TargetBB should be non-null if this is a local branch
|
||||
/// and null if it is a call. Return true on success.
|
||||
bool instrumentOneTarget(BinaryBasicBlock::iterator &Iter,
|
||||
bool instrumentOneTarget(SplitWorklistTy &SplitWorklist,
|
||||
SplitInstrsTy &SplitInstrs,
|
||||
BinaryBasicBlock::iterator &Iter,
|
||||
BinaryFunction &FromFunction,
|
||||
BinaryBasicBlock &FromBB, uint32_t From,
|
||||
BinaryFunction &ToFunc, BinaryBasicBlock *TargetBB,
|
||||
uint32_t ToOffset);
|
||||
uint32_t ToOffset, bool IsLeaf,
|
||||
FunctionDescription *FuncDesc = nullptr,
|
||||
uint32_t FromNodeID = 0, uint32_t ToNodeID = 0);
|
||||
|
||||
void instrumentExitNode(BinaryContext &BC, BinaryBasicBlock &BB,
|
||||
BinaryBasicBlock::iterator Iter, bool IsLeaf,
|
||||
FunctionDescription &FuncDesc, uint32_t Node);
|
||||
|
||||
uint32_t getFDSize() const;
|
||||
/// Create a non-allocatable ELF section with read-only tables necessary for
|
||||
/// writing the instrumented data profile during program finish. The runtime
|
||||
/// library needs to open the program executable file and read this data from
|
||||
/// disk, this is not loaded by the system.
|
||||
void emitTablesAsELFNote(BinaryContext &BC);
|
||||
|
||||
/// Critical edges worklist
|
||||
/// This worklist keeps track of CFG edges <From-To> that needs to be split.
|
||||
/// This task is deferred until we finish processing all BBs because we can't
|
||||
/// modify the CFG while iterating over it. For each edge, \p SplitInstrs
|
||||
/// stores the list of instrumentation instructions as a vector of MCInsts.
|
||||
/// instrumentOneTarget() populates this, runOnFunctions() consumes.
|
||||
std::vector<std::pair<BinaryBasicBlock *, BinaryBasicBlock *>> SplitWorklist;
|
||||
std::vector<std::vector<MCInst>> SplitInstrs;
|
||||
|
||||
/// Stores function names, to be emitted to the runtime
|
||||
std::string StringTable;
|
||||
|
||||
/// strtab indices in StringTable for each function name
|
||||
std::unordered_map<const BinaryFunction *, uint32_t> FuncToStringIdx;
|
||||
std::vector<CounterDescription> Descriptions;
|
||||
/// Intra-function control flow
|
||||
std::vector<FunctionDescription> FunctionDescriptions;
|
||||
mutable std::shared_timed_mutex FDMutex;
|
||||
|
||||
/// Inter-function control flow
|
||||
std::vector<CallDescription> CallDescriptions;
|
||||
|
||||
/// Identify all counters used in runtime while instrumentation is running
|
||||
std::vector<MCSymbol *> Labels;
|
||||
std::vector<MCSymbol *> Counters;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -3098,9 +3098,13 @@ void RewriteInstance::emitAndLink() {
|
||||
if (EFMM->ObjectsLoaded) {
|
||||
auto Result = OLT->findSymbol(Name, false);
|
||||
if (cantFail(Result.getAddress()) == 0) {
|
||||
errs()
|
||||
<< "BOLT-ERROR: symbol not found required by runtime library: "
|
||||
<< Name << "\n";
|
||||
// Resolve to a PLT entry if possible
|
||||
if (auto *I = BC->getBinaryDataByName(Name + "@PLT"))
|
||||
return JITSymbol(I->getAddress(), JITSymbolFlags());
|
||||
|
||||
errs() << "BOLT-ERROR: symbol not found required by runtime "
|
||||
"library: "
|
||||
<< Name << "\n";
|
||||
exit(1);
|
||||
}
|
||||
return Result;
|
||||
|
Loading…
x
Reference in New Issue
Block a user