mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2024-12-15 04:00:56 +00:00
[BOLT] Add new class for symbolizing X86 instructions
Summary: While disassembling instructions, we need to replace certain immediate operands with symbols. This symbolizing process relies on reading relocations against instructions. However, some X86 instructions can have multiple immediate operands and up to two relocations against them. Thus, correctly matching a relocation to an operand is not always possible without knowing the operand's offset within the instruction. Luckily, LLVM provides an interface for passing the required information from the disassembler via the virtual MCSymbolizer class. Creating a target-specific version allows precise matching of relocations to operands. This diff adds the X86MCSymbolizer class, which performs X86-specific symbolizing (currently limited to non-branch instructions). Reviewers: yota9, Amir, ayermolo, rafauler, zr33 Differential Revision: https://reviews.llvm.org/D120928
This commit is contained in:
parent
79e3d57f52
commit
e290133c76
@ -556,6 +556,9 @@ public:
|
||||
|
||||
std::unique_ptr<MCDisassembler> DisAsm;
|
||||
|
||||
/// Symbolic disassembler.
|
||||
std::unique_ptr<MCDisassembler> SymbolicDisAsm;
|
||||
|
||||
std::unique_ptr<MCAsmBackend> MAB;
|
||||
|
||||
/// Indicates if relocations are available for usage.
|
||||
|
@ -833,6 +833,15 @@ public:
|
||||
return make_range(JumpTables.begin(), JumpTables.end());
|
||||
}
|
||||
|
||||
/// Return relocation associated with a given \p Offset in the function,
|
||||
/// or nullptr if no such relocation exists.
|
||||
const Relocation *getRelocationAt(uint64_t Offset) const {
|
||||
assert(CurrentState == State::Empty &&
|
||||
"Relocations unavailable in the current function state.");
|
||||
auto RI = Relocations.find(Offset);
|
||||
return (RI == Relocations.end()) ? nullptr : &RI->second;
|
||||
}
|
||||
|
||||
/// Returns the raw binary encoding of this function.
|
||||
ErrorOr<ArrayRef<uint8_t>> getData() const;
|
||||
|
||||
|
@ -21,6 +21,7 @@
|
||||
#include "llvm/ADT/Optional.h"
|
||||
#include "llvm/ADT/StringMap.h"
|
||||
#include "llvm/MC/MCAsmBackend.h"
|
||||
#include "llvm/MC/MCDisassembler/MCSymbolizer.h"
|
||||
#include "llvm/MC/MCExpr.h"
|
||||
#include "llvm/MC/MCInst.h"
|
||||
#include "llvm/MC/MCInstrAnalysis.h"
|
||||
@ -44,6 +45,7 @@ class MCSymbol;
|
||||
class raw_ostream;
|
||||
|
||||
namespace bolt {
|
||||
class BinaryFunction;
|
||||
|
||||
/// Different types of indirect branches encountered during disassembly.
|
||||
enum class IndirectBranchType : char {
|
||||
@ -286,6 +288,12 @@ public:
|
||||
initAliases();
|
||||
}
|
||||
|
||||
/// Create and return target-specific MC symbolizer for the \p Function.
|
||||
virtual std::unique_ptr<MCSymbolizer>
|
||||
createTargetSymbolizer(BinaryFunction &Function) const {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
/// Initialize a new annotation allocator and return its id
|
||||
AllocatorIdTy initializeNewAnnotationAllocator() {
|
||||
AnnotationAllocators.emplace(MaxAllocatorId, AnnotationAllocator());
|
||||
|
@ -251,6 +251,14 @@ BinaryContext::createBinaryContext(const ObjectFile *File, bool IsPIC,
|
||||
|
||||
BC->HasFixedLoadAddress = !IsPIC;
|
||||
|
||||
BC->SymbolicDisAsm = std::unique_ptr<MCDisassembler>(
|
||||
BC->TheTarget->createMCDisassembler(*BC->STI, *BC->Ctx));
|
||||
|
||||
if (!BC->SymbolicDisAsm)
|
||||
return createStringError(
|
||||
make_error_code(std::errc::not_supported),
|
||||
Twine("BOLT-ERROR: no disassembler info for target ", TripleName));
|
||||
|
||||
return std::move(BC);
|
||||
}
|
||||
|
||||
|
@ -1028,6 +1028,8 @@ bool BinaryFunction::disassemble() {
|
||||
auto &Ctx = BC.Ctx;
|
||||
auto &MIB = BC.MIB;
|
||||
|
||||
BC.SymbolicDisAsm->setSymbolizer(MIB->createTargetSymbolizer(*this));
|
||||
|
||||
// Insert a label at the beginning of the function. This will be our first
|
||||
// basic block.
|
||||
Labels[0] = Ctx->createNamedTempSymbol("BB0");
|
||||
@ -1201,9 +1203,9 @@ bool BinaryFunction::disassemble() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!BC.DisAsm->getInstruction(Instruction, Size,
|
||||
FunctionData.slice(Offset),
|
||||
AbsoluteInstrAddr, nulls())) {
|
||||
if (!BC.SymbolicDisAsm->getInstruction(Instruction, Size,
|
||||
FunctionData.slice(Offset),
|
||||
AbsoluteInstrAddr, nulls())) {
|
||||
// Functions with "soft" boundaries, e.g. coming from assembly source,
|
||||
// can have 0-byte padding at the end.
|
||||
if (isZeroPaddingAt(Offset))
|
||||
@ -1243,12 +1245,16 @@ bool BinaryFunction::disassemble() {
|
||||
break;
|
||||
}
|
||||
|
||||
// Check if our disassembly is correct and matches the assembler output.
|
||||
if (!BC.validateEncoding(Instruction, FunctionData.slice(Offset, Size))) {
|
||||
if (opts::Verbosity >= 1) {
|
||||
// Disassemble again without the symbolizer and check that the disassembly
|
||||
// matches the assembler output.
|
||||
MCInst TempInst;
|
||||
BC.DisAsm->getInstruction(TempInst, Size, FunctionData.slice(Offset),
|
||||
AbsoluteInstrAddr, nulls());
|
||||
if (!BC.validateEncoding(TempInst, FunctionData.slice(Offset, Size))) {
|
||||
if (opts::Verbosity >= 0) {
|
||||
errs() << "BOLT-WARNING: internal assembler/disassembler error "
|
||||
"detected for AVX512 instruction:\n";
|
||||
BC.printInstruction(errs(), Instruction, AbsoluteInstrAddr);
|
||||
BC.printInstruction(errs(), TempInst, AbsoluteInstrAddr);
|
||||
errs() << " in function " << *this << '\n';
|
||||
}
|
||||
|
||||
@ -1341,7 +1347,7 @@ bool BinaryFunction::disassemble() {
|
||||
if (BC.isAArch64())
|
||||
handleAArch64IndirectCall(Instruction, Offset);
|
||||
}
|
||||
} else {
|
||||
} else if (BC.isAArch64()) {
|
||||
// Check if there's a relocation associated with this instruction.
|
||||
bool UsedReloc = false;
|
||||
for (auto Itr = Relocations.lower_bound(Offset),
|
||||
@ -1352,60 +1358,17 @@ bool BinaryFunction::disassemble() {
|
||||
if (Relocation.isPCRelative())
|
||||
SymbolValue += getAddress() + Relocation.Offset;
|
||||
|
||||
// Process reference to the symbol.
|
||||
if (BC.isX86())
|
||||
BC.handleAddressRef(SymbolValue, *this, Relocation.isPCRelative());
|
||||
int64_t Value = Relocation.Value;
|
||||
const bool Result = BC.MIB->replaceImmWithSymbolRef(
|
||||
Instruction, Relocation.Symbol, Relocation.Addend, Ctx.get(), Value,
|
||||
Relocation.Type);
|
||||
(void)Result;
|
||||
assert(Result && "cannot replace immediate with relocation");
|
||||
|
||||
if (BC.isAArch64() || !Relocation.isPCRelative()) {
|
||||
int64_t Value = Relocation.Value;
|
||||
const bool Result = BC.MIB->replaceImmWithSymbolRef(
|
||||
Instruction, Relocation.Symbol, Relocation.Addend, Ctx.get(),
|
||||
Value, Relocation.Type);
|
||||
(void)Result;
|
||||
assert(Result && "cannot replace immediate with relocation");
|
||||
|
||||
if (BC.isX86()) {
|
||||
// Make sure we replaced the correct immediate (instruction
|
||||
// can have multiple immediate operands).
|
||||
assert(
|
||||
truncateToSize(static_cast<uint64_t>(Value),
|
||||
Relocation::getSizeForType(Relocation.Type)) ==
|
||||
truncateToSize(Relocation.Value, Relocation::getSizeForType(
|
||||
Relocation.Type)) &&
|
||||
"immediate value mismatch in function");
|
||||
} else if (BC.isAArch64()) {
|
||||
// For aarch, if we replaced an immediate with a symbol from a
|
||||
// relocation, we mark it so we do not try to further process a
|
||||
// pc-relative operand. All we need is the symbol.
|
||||
UsedReloc = true;
|
||||
}
|
||||
} else {
|
||||
// Check if the relocation matches memop's Disp.
|
||||
uint64_t TargetAddress;
|
||||
if (!BC.MIB->evaluateMemOperandTarget(Instruction, TargetAddress,
|
||||
AbsoluteInstrAddr, Size)) {
|
||||
errs() << "BOLT-ERROR: PC-relative operand can't be evaluated\n";
|
||||
exit(1);
|
||||
}
|
||||
assert(TargetAddress == Relocation.Value + AbsoluteInstrAddr + Size &&
|
||||
"Immediate value mismatch detected.");
|
||||
|
||||
const MCExpr *Expr = MCSymbolRefExpr::create(
|
||||
Relocation.Symbol, MCSymbolRefExpr::VK_None, *BC.Ctx);
|
||||
// Real addend for pc-relative targets is adjusted with a delta
|
||||
// from relocation placement to the next instruction.
|
||||
const uint64_t TargetAddend =
|
||||
Relocation.Addend + Offset + Size - Relocation.Offset;
|
||||
if (TargetAddend) {
|
||||
const MCConstantExpr *Offset =
|
||||
MCConstantExpr::create(TargetAddend, *BC.Ctx);
|
||||
Expr = MCBinaryExpr::createAdd(Expr, Offset, *BC.Ctx);
|
||||
}
|
||||
BC.MIB->replaceMemOperandDisp(
|
||||
Instruction, MCOperand::createExpr(BC.MIB->getTargetExprFor(
|
||||
Instruction, Expr, *BC.Ctx, 0)));
|
||||
UsedReloc = true;
|
||||
}
|
||||
// For aarch64, if we replaced an immediate with a symbol from a
|
||||
// relocation, we mark it so we do not try to further process a
|
||||
// pc-relative operand. All we need is the symbol.
|
||||
UsedReloc = true;
|
||||
}
|
||||
|
||||
if (MIB->hasPCRelOperand(Instruction) && !UsedReloc)
|
||||
@ -1432,6 +1395,9 @@ add_instruction:
|
||||
addInstruction(Offset, std::move(Instruction));
|
||||
}
|
||||
|
||||
// Reset symbolizer for the disassembler.
|
||||
BC.SymbolicDisAsm->setSymbolizer(nullptr);
|
||||
|
||||
clearList(Relocations);
|
||||
|
||||
if (!IsSimple) {
|
||||
|
@ -2,6 +2,7 @@ set(LLVM_LINK_COMPONENTS
|
||||
DebugInfoDWARF
|
||||
Demangle
|
||||
MC
|
||||
MCDisassembler
|
||||
Object
|
||||
Support
|
||||
)
|
||||
|
@ -2,12 +2,14 @@ set(LLVM_LINK_COMPONENTS
|
||||
BOLTCore
|
||||
BOLTUtils
|
||||
MC
|
||||
MCDisassembler
|
||||
Support
|
||||
X86Desc
|
||||
)
|
||||
|
||||
add_llvm_library(LLVMBOLTTargetX86
|
||||
X86MCPlusBuilder.cpp
|
||||
X86MCSymbolizer.cpp
|
||||
|
||||
DEPENDS
|
||||
X86CommonTableGen
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include "MCTargetDesc/X86BaseInfo.h"
|
||||
#include "MCTargetDesc/X86InstrRelaxTables.h"
|
||||
#include "MCTargetDesc/X86MCTargetDesc.h"
|
||||
#include "X86MCSymbolizer.h"
|
||||
#include "bolt/Core/MCPlus.h"
|
||||
#include "bolt/Core/MCPlusBuilder.h"
|
||||
#include "llvm/BinaryFormat/ELF.h"
|
||||
@ -81,6 +82,11 @@ public:
|
||||
const MCRegisterInfo *RegInfo)
|
||||
: MCPlusBuilder(Analysis, Info, RegInfo) {}
|
||||
|
||||
std::unique_ptr<MCSymbolizer>
|
||||
createTargetSymbolizer(BinaryFunction &Function) const override {
|
||||
return std::make_unique<X86MCSymbolizer>(Function);
|
||||
}
|
||||
|
||||
bool isBranch(const MCInst &Inst) const override {
|
||||
return Analysis->isBranch(Inst) && !isTailCall(Inst);
|
||||
}
|
||||
|
107
bolt/lib/Target/X86/X86MCSymbolizer.cpp
Normal file
107
bolt/lib/Target/X86/X86MCSymbolizer.cpp
Normal file
@ -0,0 +1,107 @@
|
||||
//===- bolt/Target/X86/X86MCSymbolizer.cpp --------------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "X86MCSymbolizer.h"
|
||||
#include "MCTargetDesc/X86BaseInfo.h"
|
||||
#include "bolt/Core/BinaryContext.h"
|
||||
#include "bolt/Core/BinaryFunction.h"
|
||||
#include "bolt/Core/MCPlusBuilder.h"
|
||||
#include "bolt/Core/Relocation.h"
|
||||
#include "llvm/MC/MCInst.h"
|
||||
#include "llvm/MC/MCRegisterInfo.h"
|
||||
|
||||
#define DEBUG_TYPE "bolt-symbolizer"
|
||||
|
||||
namespace llvm {
|
||||
namespace bolt {
|
||||
|
||||
// Out-of-line definition of the destructor; there is nothing to release.
X86MCSymbolizer::~X86MCSymbolizer() = default;
|
||||
|
||||
/// Try to append a symbolic expression to \p Inst in place of the immediate
/// operand that is currently being decoded.
///
/// \p Value is the decoded immediate, \p InstAddress the address of the
/// instruction, \p ImmOffset and \p ImmSize the position and size of the
/// immediate within the instruction, and \p InstSize the size of the whole
/// instruction. \p CStream is not used.
///
/// \returns true if a symbolic operand was added to \p Inst, false if the
/// instruction was left untouched.
bool X86MCSymbolizer::tryAddingSymbolicOperand(
    MCInst &Inst, raw_ostream &CStream, int64_t Value, uint64_t InstAddress,
    bool IsBranch, uint64_t ImmOffset, uint64_t ImmSize, uint64_t InstSize) {
  // Branch operands are not symbolized here, and a zero-sized immediate
  // denotes an implicit operand.
  if (IsBranch || ImmSize == 0)
    return false;

  BinaryContext &BC = Function.getBinaryContext();
  if (BC.MIB->isBranch(Inst) || BC.MIB->isCall(Inst))
    return false;

  MCContext &Context = *BC.Ctx;

  // Appends `Symbol` (plus `Addend` when it is non-zero) to the instruction
  // as an expression operand.
  const auto AppendSymbolRef = [&Inst, &Context](const MCSymbol *Symbol,
                                                 uint64_t Addend) {
    const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, Context);
    if (Addend) {
      const MCExpr *AddendExpr = MCConstantExpr::create(Addend, Context);
      Expr = MCBinaryExpr::createAdd(Expr, AddendExpr, Context);
    }
    Inst.addOperand(MCOperand::createExpr(Expr));
  };

  // First choice: a relocation recorded exactly at the immediate's offset
  // inside the function.
  const uint64_t InstOffset = InstAddress - Function.getAddress();
  if (const Relocation *Rel =
          Function.getRelocationAt(InstOffset + ImmOffset)) {
    const bool IsPCRel = Rel->isPCRelative();

    uint64_t SymbolValue = Rel->Value - Rel->Addend;
    if (IsPCRel)
      SymbolValue += InstAddress + ImmOffset;

    // Register the reference to the symbol with the binary context.
    BC.handleAddressRef(SymbolValue, Function, IsPCRel);

    // For PC-relative relocations the addend is shifted by the distance from
    // the relocation's location to the end of the instruction.
    uint64_t Addend = Rel->Addend;
    if (IsPCRel)
      Addend += InstOffset + InstSize - Rel->Offset;

    AppendSymbolRef(Rel->Symbol, Addend);
    return true;
  }

  // No relocation: the operand can still be symbolized when it is the
  // displacement of a PC-relative memory operand.
  const int MemOpNo = BC.MIB->getMemoryOperandNo(Inst);
  if (MemOpNo == -1)
    return false;

  // The displacement has to be the very next operand to be appended.
  const unsigned DispOpNo = MemOpNo + X86::AddrDisp;
  if (Inst.getNumOperands() != DispOpNo)
    return false;

  const MCOperand &BaseOp = Inst.getOperand(MemOpNo + X86::AddrBaseReg);
  const bool HasPCBase = BaseOp.getReg() == BC.MRI->getProgramCounter();
  if (!HasPCBase)
    return false;

  // Give up when a non-zero scale is combined with an index register.
  const MCOperand &ScaleOp = Inst.getOperand(MemOpNo + X86::AddrScaleAmt);
  const MCOperand &IndexOp = Inst.getOperand(MemOpNo + X86::AddrIndexReg);
  if (!(ScaleOp.getImm() == 0 || IndexOp.getReg() == MCRegister::NoRegister))
    return false;

  const MCSymbol *TargetSymbol;
  uint64_t TargetOffset;
  std::tie(TargetSymbol, TargetOffset) =
      BC.handleAddressRef(Value, Function, /*IsPCRel*/ true);

  AppendSymbolRef(TargetSymbol, TargetOffset);
  return true;
}
|
||||
|
||||
/// This symbolizer emits no PC-load reference comments; the hook does
/// nothing and ignores all of its arguments.
void X86MCSymbolizer::tryAddingPcLoadReferenceComment(raw_ostream &CStream,
                                                      int64_t Value,
                                                      uint64_t Address) {
  // Intentionally empty.
}
|
||||
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
43
bolt/lib/Target/X86/X86MCSymbolizer.h
Normal file
43
bolt/lib/Target/X86/X86MCSymbolizer.h
Normal file
@ -0,0 +1,43 @@
|
||||
//===- bolt/Target/X86/X86MCSymbolizer.h ------------------------*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef BOLT_CORE_X86MCSYMBOLIZER_H
|
||||
#define BOLT_CORE_X86MCSYMBOLIZER_H
|
||||
|
||||
#include "bolt/Core/BinaryFunction.h"
|
||||
#include "llvm/MC/MCDisassembler/MCSymbolizer.h"
|
||||
|
||||
namespace llvm {
|
||||
namespace bolt {
|
||||
|
||||
class X86MCSymbolizer : public MCSymbolizer {
|
||||
protected:
|
||||
BinaryFunction &Function;
|
||||
|
||||
public:
|
||||
X86MCSymbolizer(BinaryFunction &Function)
|
||||
: MCSymbolizer(*Function.getBinaryContext().Ctx.get(), nullptr),
|
||||
Function(Function) {}
|
||||
|
||||
X86MCSymbolizer(const X86MCSymbolizer &) = delete;
|
||||
X86MCSymbolizer &operator=(const X86MCSymbolizer &) = delete;
|
||||
virtual ~X86MCSymbolizer();
|
||||
|
||||
bool tryAddingSymbolicOperand(MCInst &Inst, raw_ostream &CStream,
|
||||
int64_t Value, uint64_t Address, bool IsBranch,
|
||||
uint64_t Offset, uint64_t OpSize,
|
||||
uint64_t InstSize) override;
|
||||
|
||||
void tryAddingPcLoadReferenceComment(raw_ostream &CStream, int64_t Value,
|
||||
uint64_t Address) override;
|
||||
};
|
||||
|
||||
} // namespace bolt
|
||||
} // namespace llvm
|
||||
|
||||
#endif
|
42
bolt/test/X86/double-rel.s
Normal file
42
bolt/test/X86/double-rel.s
Normal file
@ -0,0 +1,42 @@
|
||||
## Check that BOLT can correctly use relocations to symbolize instruction
|
||||
## operands when an instruction can have up to two relocations associated
|
||||
## with it.
|
||||
|
||||
# REQUIRES: system-linux
|
||||
|
||||
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-linux %s -o %t.o
|
||||
# RUN: ld.lld %t.o -o %t.exe -q --Tdata=0x80000
|
||||
# RUN: llvm-bolt %t.exe -relocs -o /dev/null -print-only=_start -print-disasm \
|
||||
# RUN: | FileCheck %s --check-prefix=CHECK-BOLT
|
||||
# RUN: llvm-objdump -d --print-imm-hex %t.exe \
|
||||
# RUN: | FileCheck %s --check-prefix=CHECK-OBJDUMP
|
||||
|
||||
.data
|
||||
.globl VAR
|
||||
VAR:
|
||||
.quad
|
||||
|
||||
.text
|
||||
.globl _start
|
||||
.type _start,@function
|
||||
_start:
|
||||
.cfi_startproc
|
||||
|
||||
## VAR value is 0x80000. Using relocations, llvm-bolt should correctly
|
||||
## symbolize the instruction operands.
|
||||
|
||||
movq $VAR, 0x80000
|
||||
# CHECK-BOLT: movq $VAR, 0x80000
|
||||
# CHECK-OBJDUMP: movq $0x80000, 0x80000
|
||||
|
||||
movq $0x80000, VAR
|
||||
# CHECK-BOLT-NEXT: movq $0x80000, VAR
|
||||
# CHECK-OBJDUMP-NEXT: movq $0x80000, 0x80000
|
||||
|
||||
movq $VAR, VAR
|
||||
# CHECK-BOLT-NEXT: movq $VAR, VAR
|
||||
# CHECK-OBJDUMP-NEXT: movq $0x80000, 0x80000
|
||||
|
||||
retq
|
||||
.size _start, .-_start
|
||||
.cfi_endproc
|
Loading…
Reference in New Issue
Block a user