mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2025-02-17 16:31:02 +00:00
[BOLT] Fix handling of code references from unmodified code
In lite mode (default for X86), BOLT optimizes and relocates functions with profile. The rest of the code is preserved, but if it references relocated code such references have to be updated. The update is handled by scanExternalRefs() function. Note that we cannot solely rely on relocations written by the linker, as not all code references are exposed to the linker. Additionally, the linker can modify certain instructions and relocations will no longer match the code. With this change, start using symbolic disassembler for scanning code for references in scanExternalRefs(). Unlike the previous approach, the symbolizer properly detects and creates references for instructions with multiple/ambiguous symbolic operands and handles cases where a relocation doesn't match any operand. See test cases for examples. Reviewed By: Amir Differential Revision: https://reviews.llvm.org/D152631
This commit is contained in:
parent
85f1726c1b
commit
43f56a2f27
@ -2050,9 +2050,11 @@ public:
|
||||
void handleAArch64IndirectCall(MCInst &Instruction, const uint64_t Offset);
|
||||
|
||||
/// Scan function for references to other functions. In relocation mode,
|
||||
/// add relocations for external references.
|
||||
/// add relocations for external references. In non-relocation mode, detect
|
||||
/// and mark new entry points.
|
||||
///
|
||||
/// Return true on success.
|
||||
/// Return true on success. False if the disassembly failed or relocations
|
||||
/// could not be created.
|
||||
bool scanExternalRefs();
|
||||
|
||||
/// Return the size of a data object located at \p Offset in the function.
|
||||
|
@ -336,9 +336,12 @@ public:
|
||||
initSizeMap();
|
||||
}
|
||||
|
||||
/// Create and return target-specific MC symbolizer for the \p Function.
|
||||
/// Create and return a target-specific MC symbolizer for the \p Function.
|
||||
/// When \p CreateNewSymbols is set, the symbolizer can create new symbols
|
||||
/// e.g. for jump table references.
|
||||
virtual std::unique_ptr<MCSymbolizer>
|
||||
createTargetSymbolizer(BinaryFunction &Function) const {
|
||||
createTargetSymbolizer(BinaryFunction &Function,
|
||||
bool CreateNewSymbols = true) const {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
@ -1412,6 +1412,11 @@ bool BinaryFunction::scanExternalRefs() {
|
||||
assert(FunctionData.size() == getMaxSize() &&
|
||||
"function size does not match raw data size");
|
||||
|
||||
BC.SymbolicDisAsm->setSymbolizer(
|
||||
BC.MIB->createTargetSymbolizer(*this, /*CreateSymbols*/ false));
|
||||
|
||||
// Disassemble contents of the function. Detect code entry points and create
|
||||
// relocations for references to code that will be moved.
|
||||
uint64_t Size = 0; // instruction size
|
||||
for (uint64_t Offset = 0; Offset < getSize(); Offset += Size) {
|
||||
// Check for data inside code and ignore it
|
||||
@ -1422,9 +1427,9 @@ bool BinaryFunction::scanExternalRefs() {
|
||||
|
||||
const uint64_t AbsoluteInstrAddr = getAddress() + Offset;
|
||||
MCInst Instruction;
|
||||
if (!BC.DisAsm->getInstruction(Instruction, Size,
|
||||
FunctionData.slice(Offset),
|
||||
AbsoluteInstrAddr, nulls())) {
|
||||
if (!BC.SymbolicDisAsm->getInstruction(Instruction, Size,
|
||||
FunctionData.slice(Offset),
|
||||
AbsoluteInstrAddr, nulls())) {
|
||||
if (opts::Verbosity >= 1 && !isZeroPaddingAt(Offset)) {
|
||||
errs() << "BOLT-WARNING: unable to disassemble instruction at offset 0x"
|
||||
<< Twine::utohexstr(Offset) << " (address 0x"
|
||||
@ -1465,74 +1470,41 @@ bool BinaryFunction::scanExternalRefs() {
|
||||
return ignoreFunctionRef(*TargetFunction);
|
||||
};
|
||||
|
||||
// Detect if the instruction references an address.
|
||||
// Without relocations, we can only trust PC-relative address modes.
|
||||
uint64_t TargetAddress = 0;
|
||||
bool IsPCRel = false;
|
||||
bool IsBranch = false;
|
||||
if (BC.MIB->hasPCRelOperand(Instruction)) {
|
||||
IsPCRel = BC.MIB->evaluateMemOperandTarget(Instruction, TargetAddress,
|
||||
AbsoluteInstrAddr, Size);
|
||||
} else if (BC.MIB->isCall(Instruction) || BC.MIB->isBranch(Instruction)) {
|
||||
IsBranch = BC.MIB->evaluateBranch(Instruction, AbsoluteInstrAddr, Size,
|
||||
TargetAddress);
|
||||
}
|
||||
// Handle calls and branches separately as symbolization doesn't work for
|
||||
// them yet.
|
||||
MCSymbol *BranchTargetSymbol = nullptr;
|
||||
if (BC.MIB->isCall(Instruction) || BC.MIB->isBranch(Instruction)) {
|
||||
uint64_t TargetAddress = 0;
|
||||
BC.MIB->evaluateBranch(Instruction, AbsoluteInstrAddr, Size,
|
||||
TargetAddress);
|
||||
|
||||
MCSymbol *TargetSymbol = nullptr;
|
||||
// Create an entry point at reference address if needed.
|
||||
BinaryFunction *TargetFunction =
|
||||
BC.getBinaryFunctionContainingAddress(TargetAddress);
|
||||
|
||||
if (!TargetFunction || ignoreFunctionRef(*TargetFunction))
|
||||
continue;
|
||||
|
||||
// Create an entry point at reference address if needed.
|
||||
BinaryFunction *TargetFunction =
|
||||
BC.getBinaryFunctionContainingAddress(TargetAddress);
|
||||
if (TargetFunction && !ignoreFunctionRef(*TargetFunction)) {
|
||||
const uint64_t FunctionOffset =
|
||||
TargetAddress - TargetFunction->getAddress();
|
||||
TargetSymbol = FunctionOffset
|
||||
? TargetFunction->addEntryPointAtOffset(FunctionOffset)
|
||||
BranchTargetSymbol =
|
||||
FunctionOffset ? TargetFunction->addEntryPointAtOffset(FunctionOffset)
|
||||
: TargetFunction->getSymbol();
|
||||
}
|
||||
|
||||
// Can't find more references and not creating relocations.
|
||||
// Can't find more references. Not creating relocations since we are not
|
||||
// moving code.
|
||||
if (!BC.HasRelocations)
|
||||
continue;
|
||||
|
||||
// Create a relocation against the TargetSymbol as the symbol might get
|
||||
// moved.
|
||||
if (TargetSymbol) {
|
||||
if (IsBranch) {
|
||||
BC.MIB->replaceBranchTarget(Instruction, TargetSymbol,
|
||||
Emitter.LocalCtx.get());
|
||||
} else if (IsPCRel) {
|
||||
const MCExpr *Expr = MCSymbolRefExpr::create(
|
||||
TargetSymbol, MCSymbolRefExpr::VK_None, *Emitter.LocalCtx.get());
|
||||
BC.MIB->replaceMemOperandDisp(
|
||||
Instruction, MCOperand::createExpr(BC.MIB->getTargetExprFor(
|
||||
Instruction, Expr, *Emitter.LocalCtx.get(), 0)));
|
||||
}
|
||||
}
|
||||
|
||||
// Create more relocations based on input file relocations.
|
||||
bool HasRel = false;
|
||||
for (auto Itr = Relocations.lower_bound(Offset),
|
||||
ItrE = Relocations.lower_bound(Offset + Size);
|
||||
Itr != ItrE; ++Itr) {
|
||||
Relocation &Relocation = Itr->second;
|
||||
if (Relocation.isPCRelative() && BC.isX86())
|
||||
continue;
|
||||
if (ignoreReference(Relocation.Symbol))
|
||||
continue;
|
||||
|
||||
int64_t Value = Relocation.Value;
|
||||
const bool Result = BC.MIB->replaceImmWithSymbolRef(
|
||||
Instruction, Relocation.Symbol, Relocation.Addend,
|
||||
Emitter.LocalCtx.get(), Value, Relocation.Type);
|
||||
(void)Result;
|
||||
assert(Result && "cannot replace immediate with relocation");
|
||||
|
||||
HasRel = true;
|
||||
}
|
||||
|
||||
if (!TargetSymbol && !HasRel)
|
||||
if (BranchTargetSymbol) {
|
||||
BC.MIB->replaceBranchTarget(Instruction, BranchTargetSymbol,
|
||||
Emitter.LocalCtx.get());
|
||||
} else if (!llvm::any_of(Instruction,
|
||||
[](const MCOperand &Op) { return Op.isExpr(); })) {
|
||||
// Skip assembly if the instruction may not have any symbolic operands.
|
||||
continue;
|
||||
}
|
||||
|
||||
// Emit the instruction using temp emitter and generate relocations.
|
||||
SmallString<256> Code;
|
||||
@ -1547,6 +1519,9 @@ bool BinaryFunction::scanExternalRefs() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ignoreReference(Rel->Symbol))
|
||||
continue;
|
||||
|
||||
if (Relocation::getSizeForType(Rel->Type) < 4) {
|
||||
// If the instruction uses a short form, then we might not be able
|
||||
// to handle the rewrite without relaxation, and hence cannot reliably
|
||||
@ -1562,6 +1537,9 @@ bool BinaryFunction::scanExternalRefs() {
|
||||
break;
|
||||
}
|
||||
|
||||
// Reset symbolizer for the disassembler.
|
||||
BC.SymbolicDisAsm->setSymbolizer(nullptr);
|
||||
|
||||
// Add relocations unless disassembly failed for this function.
|
||||
if (!DisassemblyFailed)
|
||||
for (Relocation &Rel : FunctionRelocations)
|
||||
|
@ -73,8 +73,9 @@ public:
|
||||
: MCPlusBuilder(Analysis, Info, RegInfo) {}
|
||||
|
||||
std::unique_ptr<MCSymbolizer>
|
||||
createTargetSymbolizer(BinaryFunction &Function) const override {
|
||||
return std::make_unique<X86MCSymbolizer>(Function);
|
||||
createTargetSymbolizer(BinaryFunction &Function,
|
||||
bool CreateNewSymbols) const override {
|
||||
return std::make_unique<X86MCSymbolizer>(Function, CreateNewSymbols);
|
||||
}
|
||||
|
||||
bool isBranch(const MCInst &Inst) const override {
|
||||
@ -2462,6 +2463,8 @@ public:
|
||||
//
|
||||
// Only the following limited expression types are supported:
|
||||
// Symbol + Addend
|
||||
// Symbol + Constant + Addend
|
||||
// Const + Addend
|
||||
// Symbol
|
||||
uint64_t Addend = 0;
|
||||
MCSymbol *Symbol = nullptr;
|
||||
@ -2471,11 +2474,25 @@ public:
|
||||
assert(BinaryExpr->getOpcode() == MCBinaryExpr::Add &&
|
||||
"unexpected binary expression");
|
||||
const MCExpr *LHS = BinaryExpr->getLHS();
|
||||
assert(LHS->getKind() == MCExpr::SymbolRef && "unexpected LHS");
|
||||
Symbol = const_cast<MCSymbol *>(this->getTargetSymbol(LHS));
|
||||
if (LHS->getKind() == MCExpr::Constant) {
|
||||
Addend = cast<MCConstantExpr>(LHS)->getValue();
|
||||
} else if (LHS->getKind() == MCExpr::Binary) {
|
||||
const auto *LHSBinaryExpr = cast<MCBinaryExpr>(LHS);
|
||||
assert(LHSBinaryExpr->getOpcode() == MCBinaryExpr::Add &&
|
||||
"unexpected binary expression");
|
||||
const MCExpr *LLHS = LHSBinaryExpr->getLHS();
|
||||
assert(LLHS->getKind() == MCExpr::SymbolRef && "unexpected LLHS");
|
||||
Symbol = const_cast<MCSymbol *>(this->getTargetSymbol(LLHS));
|
||||
const MCExpr *RLHS = LHSBinaryExpr->getRHS();
|
||||
assert(RLHS->getKind() == MCExpr::Constant && "unexpected RLHS");
|
||||
Addend = cast<MCConstantExpr>(RLHS)->getValue();
|
||||
} else {
|
||||
assert(LHS->getKind() == MCExpr::SymbolRef && "unexpected LHS");
|
||||
Symbol = const_cast<MCSymbol *>(this->getTargetSymbol(LHS));
|
||||
}
|
||||
const MCExpr *RHS = BinaryExpr->getRHS();
|
||||
assert(RHS->getKind() == MCExpr::Constant && "unexpected RHS");
|
||||
Addend = cast<MCConstantExpr>(RHS)->getValue();
|
||||
Addend += cast<MCConstantExpr>(RHS)->getValue();
|
||||
} else {
|
||||
assert(ValueExpr->getKind() == MCExpr::SymbolRef && "unexpected value");
|
||||
Symbol = const_cast<MCSymbol *>(this->getTargetSymbol(ValueExpr));
|
||||
|
@ -70,8 +70,18 @@ bool X86MCSymbolizer::tryAddingSymbolicOperand(
|
||||
|
||||
const MCSymbol *TargetSymbol;
|
||||
uint64_t TargetOffset;
|
||||
std::tie(TargetSymbol, TargetOffset) =
|
||||
BC.handleAddressRef(Value, Function, /*IsPCRel=*/true);
|
||||
|
||||
if (!CreateNewSymbols) {
|
||||
if (BinaryData *BD = BC.getBinaryDataContainingAddress(Value)) {
|
||||
TargetSymbol = BD->getSymbol();
|
||||
TargetOffset = Value - BD->getAddress();
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
std::tie(TargetSymbol, TargetOffset) =
|
||||
BC.handleAddressRef(Value, Function, /*IsPCRel=*/true);
|
||||
}
|
||||
|
||||
addOperand(TargetSymbol, TargetOffset);
|
||||
|
||||
@ -98,10 +108,15 @@ bool X86MCSymbolizer::tryAddingSymbolicOperand(
|
||||
|
||||
// The linker converted the PC-relative address to an absolute one.
|
||||
// Symbolize this address.
|
||||
BC.handleAddressRef(Value, Function, /*IsPCRel=*/false);
|
||||
if (CreateNewSymbols)
|
||||
BC.handleAddressRef(Value, Function, /*IsPCRel=*/false);
|
||||
|
||||
const BinaryData *Target = BC.getBinaryDataAtAddress(Value);
|
||||
assert(Target &&
|
||||
"BinaryData should exist at converted GOTPCRELX destination");
|
||||
if (!Target) {
|
||||
assert(!CreateNewSymbols &&
|
||||
"BinaryData should exist at converted GOTPCRELX destination");
|
||||
return false;
|
||||
}
|
||||
|
||||
addOperand(Target->getSymbol(), /*Addend=*/0);
|
||||
|
||||
@ -120,7 +135,8 @@ bool X86MCSymbolizer::tryAddingSymbolicOperand(
|
||||
SymbolValue += InstAddress + ImmOffset;
|
||||
|
||||
// Process reference to the symbol.
|
||||
BC.handleAddressRef(SymbolValue, Function, Relocation->isPCRelative());
|
||||
if (CreateNewSymbols)
|
||||
BC.handleAddressRef(SymbolValue, Function, Relocation->isPCRelative());
|
||||
|
||||
uint64_t Addend = Relocation->Addend;
|
||||
// Real addend for pc-relative targets is adjusted with a delta from
|
||||
|
@ -18,11 +18,12 @@ namespace bolt {
|
||||
class X86MCSymbolizer : public MCSymbolizer {
|
||||
protected:
|
||||
BinaryFunction &Function;
|
||||
bool CreateNewSymbols{true};
|
||||
|
||||
public:
|
||||
X86MCSymbolizer(BinaryFunction &Function)
|
||||
X86MCSymbolizer(BinaryFunction &Function, bool CreateNewSymbols = true)
|
||||
: MCSymbolizer(*Function.getBinaryContext().Ctx.get(), nullptr),
|
||||
Function(Function) {}
|
||||
Function(Function), CreateNewSymbols(CreateNewSymbols) {}
|
||||
|
||||
X86MCSymbolizer(const X86MCSymbolizer &) = delete;
|
||||
X86MCSymbolizer &operator=(const X86MCSymbolizer &) = delete;
|
||||
|
67
bolt/test/X86/double-rel-scan.s
Normal file
67
bolt/test/X86/double-rel-scan.s
Normal file
@ -0,0 +1,67 @@
|
||||
## Check that BOLT can correctly use relocations to symbolize instruction
|
||||
## operands when an instruction can have up to two relocations associated
|
||||
## with it. The test checks that the update happens in the function that
|
||||
## is not being optimized/relocated by BOLT.
|
||||
|
||||
# REQUIRES: system-linux
|
||||
|
||||
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-linux %s -o %t.o
|
||||
# RUN: ld.lld %t.o -o %t.exe -q --Ttext=0x80000
|
||||
# RUN: llvm-bolt %t.exe --relocs -o %t.bolt --funcs=foo
|
||||
# RUN: llvm-objdump -d --print-imm-hex %t.exe \
|
||||
# RUN: | FileCheck %s
|
||||
# RUN: llvm-objdump -d --print-imm-hex %t.bolt \
|
||||
# RUN: | FileCheck %s --check-prefix=CHECK-POST-BOLT
|
||||
|
||||
.text
|
||||
.globl foo
|
||||
.type foo,@function
|
||||
foo:
|
||||
.cfi_startproc
|
||||
ret
|
||||
.cfi_endproc
|
||||
.size foo, .-foo
|
||||
|
||||
|
||||
.text
|
||||
.globl _start
|
||||
.type _start,@function
|
||||
_start:
|
||||
.cfi_startproc
|
||||
|
||||
## All four MOV instructions below are identical in the input binary, but
|
||||
## different from each other after BOLT.
|
||||
##
|
||||
## foo value is 0x80000 pre-BOLT. Using relocations, llvm-bolt should correctly
|
||||
## symbolize 0x80000 instruction operand and differentiate between an immediate
|
||||
## constant 0x80000 and a reference to foo. When foo is relocated, BOLT should
|
||||
## update references even from the code that is not being updated.
|
||||
|
||||
# CHECK: 80000 <foo>:
|
||||
|
||||
# CHECK: <_start>:
|
||||
# CHECK-POST-BOLT: <_start>:
|
||||
|
||||
movq $0x80000, 0x80000
|
||||
# CHECK-NEXT: movq $0x80000, 0x80000
|
||||
# CHECK-POST-BOLT-NEXT: movq $0x80000, 0x80000
|
||||
|
||||
movq $foo, 0x80000
|
||||
# CHECK-NEXT: movq $0x80000, 0x80000
|
||||
# CHECK-POST-BOLT-NEXT: movq $0x[[#%x,ADDR:]], 0x80000
|
||||
|
||||
movq $0x80000, foo
|
||||
# CHECK-NEXT: movq $0x80000, 0x80000
|
||||
# CHECK-POST-BOLT-NEXT: movq $0x80000, 0x[[#ADDR]]
|
||||
|
||||
movq $foo, foo
|
||||
# CHECK-NEXT: movq $0x80000, 0x80000
|
||||
# CHECK-POST-BOLT-NEXT: movq $0x[[#ADDR]], 0x[[#ADDR]]
|
||||
|
||||
## After BOLT, foo is relocated after _start.
|
||||
|
||||
# CHECK-POST-BOLT: [[#ADDR]] <foo>:
|
||||
|
||||
retq
|
||||
.size _start, .-_start
|
||||
.cfi_endproc
|
@ -25,6 +25,10 @@ _start:
|
||||
## VAR value is 0x80000. Using relocations, llvm-bolt should correctly
|
||||
## symbolize the instruction operands.
|
||||
|
||||
movq $0x80000, 0x80000
|
||||
# CHECK-BOLT: movq $0x80000, 0x80000
|
||||
# CHECK-OBJDUMP: movq $0x80000, 0x80000
|
||||
|
||||
movq $VAR, 0x80000
|
||||
# CHECK-BOLT: movq $VAR, 0x80000
|
||||
# CHECK-OBJDUMP: movq $0x80000, 0x80000
|
||||
|
@ -19,14 +19,19 @@
|
||||
# RUN: llvm-objdump -d --no-show-raw-insn --print-imm-hex \
|
||||
# RUN: %t.out | FileCheck --check-prefix=DISASM %s
|
||||
|
||||
## Relocate foo only and check that code references from _start (that is
|
||||
## otherwise preserved) are updated.
|
||||
|
||||
# RUN: llvm-bolt %t.exe --relocs -o %t.lite.out --funcs=foo
|
||||
# RUN: llvm-objdump -d --no-show-raw-insn --print-imm-hex \
|
||||
# RUN: %t.lite.out | FileCheck --check-prefix=DISASM %s
|
||||
|
||||
.text
|
||||
.globl _start
|
||||
.type _start, %function
|
||||
_start:
|
||||
.cfi_startproc
|
||||
# DISASM: Disassembly of section .text:
|
||||
# DISASM-EMPTY:
|
||||
# DISASM-NEXT: <_start>:
|
||||
# DISASM: <_start>:
|
||||
|
||||
call *foo@GOTPCREL(%rip)
|
||||
# NO-RELAX-BOLT: callq *{{.*}}(%rip)
|
||||
@ -64,6 +69,8 @@ _start:
|
||||
# PIE-BOLT-NEXT: jmp foo
|
||||
# DISASM-NEXT: jmp 0x[[#ADDR]]
|
||||
|
||||
# DISASM: [[#ADDR]] <foo>:
|
||||
|
||||
ret
|
||||
.cfi_endproc
|
||||
.size _start, .-_start
|
||||
|
Loading…
x
Reference in New Issue
Block a user