[BOLT] Add new class for symbolizing X86 instructions

Summary:
While disassembling instructions, we need to replace certain immediate
operands with symbols. This symbolizing process relies on reading
relocations against instructions. However, some X86 instructions can
have multiple immediate operands and up to two relocations against
them. Thus, correctly matching a relocation to an operand is not
always possible without knowing the operand offset within the
instruction.

Luckily, LLVM provides an interface for passing the required info from
the disassembler via a virtual MCSymbolizer class. Creating a
target-specific version allows a precise matching of relocations to
operands.

This diff adds X86MCSymbolizer class that performs X86-specific
symbolizing (currently limited to non-branch instructions).

Reviewers: yota9, Amir, ayermolo, rafauler, zr33

Differential Revision: https://reviews.llvm.org/D120928
This commit is contained in:
Maksim Panchenko 2022-02-22 19:06:25 -08:00
parent 79e3d57f52
commit e290133c76
11 changed files with 256 additions and 61 deletions

View File

@ -556,6 +556,9 @@ public:
std::unique_ptr<MCDisassembler> DisAsm;
/// Symbolic disassembler.
std::unique_ptr<MCDisassembler> SymbolicDisAsm;
std::unique_ptr<MCAsmBackend> MAB;
/// Indicates if relocations are available for usage.

View File

@ -833,6 +833,15 @@ public:
return make_range(JumpTables.begin(), JumpTables.end());
}
/// Look up the relocation recorded at exactly \p Offset within this function.
///
/// \param Offset key used to search the function's relocation map.
/// \returns a pointer to the stored relocation, or nullptr when the map has
/// no entry for \p Offset.
///
/// Asserts that the function is still in State::Empty.
const Relocation *getRelocationAt(uint64_t Offset) const {
  assert(CurrentState == State::Empty &&
         "Relocations unavailable in the current function state.");
  const auto Found = Relocations.find(Offset);
  if (Found == Relocations.end())
    return nullptr;
  return &Found->second;
}
/// Returns the raw binary encoding of this function.
ErrorOr<ArrayRef<uint8_t>> getData() const;

View File

@ -21,6 +21,7 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCDisassembler/MCSymbolizer.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrAnalysis.h"
@ -44,6 +45,7 @@ class MCSymbol;
class raw_ostream;
namespace bolt {
class BinaryFunction;
/// Different types of indirect branches encountered during disassembly.
enum class IndirectBranchType : char {
@ -286,6 +288,12 @@ public:
initAliases();
}
/// Factory hook for a target-specific MC symbolizer bound to \p Function.
///
/// This default implementation creates nothing and hands back a null
/// pointer; targets that provide a symbolizer override it.
virtual std::unique_ptr<MCSymbolizer>
createTargetSymbolizer(BinaryFunction &Function) const {
  return std::unique_ptr<MCSymbolizer>();
}
/// Initialize a new annotation allocator and return its id
AllocatorIdTy initializeNewAnnotationAllocator() {
AnnotationAllocators.emplace(MaxAllocatorId, AnnotationAllocator());

View File

@ -251,6 +251,14 @@ BinaryContext::createBinaryContext(const ObjectFile *File, bool IsPIC,
BC->HasFixedLoadAddress = !IsPIC;
BC->SymbolicDisAsm = std::unique_ptr<MCDisassembler>(
BC->TheTarget->createMCDisassembler(*BC->STI, *BC->Ctx));
if (!BC->SymbolicDisAsm)
return createStringError(
make_error_code(std::errc::not_supported),
Twine("BOLT-ERROR: no disassembler info for target ", TripleName));
return std::move(BC);
}

View File

@ -1028,6 +1028,8 @@ bool BinaryFunction::disassemble() {
auto &Ctx = BC.Ctx;
auto &MIB = BC.MIB;
BC.SymbolicDisAsm->setSymbolizer(MIB->createTargetSymbolizer(*this));
// Insert a label at the beginning of the function. This will be our first
// basic block.
Labels[0] = Ctx->createNamedTempSymbol("BB0");
@ -1201,9 +1203,9 @@ bool BinaryFunction::disassemble() {
continue;
}
if (!BC.DisAsm->getInstruction(Instruction, Size,
FunctionData.slice(Offset),
AbsoluteInstrAddr, nulls())) {
if (!BC.SymbolicDisAsm->getInstruction(Instruction, Size,
FunctionData.slice(Offset),
AbsoluteInstrAddr, nulls())) {
// Functions with "soft" boundaries, e.g. coming from assembly source,
// can have 0-byte padding at the end.
if (isZeroPaddingAt(Offset))
@ -1243,12 +1245,16 @@ bool BinaryFunction::disassemble() {
break;
}
// Check if our disassembly is correct and matches the assembler output.
if (!BC.validateEncoding(Instruction, FunctionData.slice(Offset, Size))) {
if (opts::Verbosity >= 1) {
// Disassemble again without the symbolizer and check that the disassembly
// matches the assembler output.
MCInst TempInst;
BC.DisAsm->getInstruction(TempInst, Size, FunctionData.slice(Offset),
AbsoluteInstrAddr, nulls());
if (!BC.validateEncoding(TempInst, FunctionData.slice(Offset, Size))) {
// Report the mismatch only when verbose output was requested; a `>= 0`
// threshold is satisfied by every non-negative verbosity level and would
// make this warning unconditional.
if (opts::Verbosity >= 1) {
errs() << "BOLT-WARNING: internal assembler/disassembler error "
"detected for AVX512 instruction:\n";
BC.printInstruction(errs(), Instruction, AbsoluteInstrAddr);
BC.printInstruction(errs(), TempInst, AbsoluteInstrAddr);
errs() << " in function " << *this << '\n';
}
@ -1341,7 +1347,7 @@ bool BinaryFunction::disassemble() {
if (BC.isAArch64())
handleAArch64IndirectCall(Instruction, Offset);
}
} else {
} else if (BC.isAArch64()) {
// Check if there's a relocation associated with this instruction.
bool UsedReloc = false;
for (auto Itr = Relocations.lower_bound(Offset),
@ -1352,60 +1358,17 @@ bool BinaryFunction::disassemble() {
if (Relocation.isPCRelative())
SymbolValue += getAddress() + Relocation.Offset;
// Process reference to the symbol.
if (BC.isX86())
BC.handleAddressRef(SymbolValue, *this, Relocation.isPCRelative());
int64_t Value = Relocation.Value;
const bool Result = BC.MIB->replaceImmWithSymbolRef(
Instruction, Relocation.Symbol, Relocation.Addend, Ctx.get(), Value,
Relocation.Type);
(void)Result;
assert(Result && "cannot replace immediate with relocation");
if (BC.isAArch64() || !Relocation.isPCRelative()) {
int64_t Value = Relocation.Value;
const bool Result = BC.MIB->replaceImmWithSymbolRef(
Instruction, Relocation.Symbol, Relocation.Addend, Ctx.get(),
Value, Relocation.Type);
(void)Result;
assert(Result && "cannot replace immediate with relocation");
if (BC.isX86()) {
// Make sure we replaced the correct immediate (instruction
// can have multiple immediate operands).
assert(
truncateToSize(static_cast<uint64_t>(Value),
Relocation::getSizeForType(Relocation.Type)) ==
truncateToSize(Relocation.Value, Relocation::getSizeForType(
Relocation.Type)) &&
"immediate value mismatch in function");
} else if (BC.isAArch64()) {
// For aarch, if we replaced an immediate with a symbol from a
// relocation, we mark it so we do not try to further process a
// pc-relative operand. All we need is the symbol.
UsedReloc = true;
}
} else {
// Check if the relocation matches memop's Disp.
uint64_t TargetAddress;
if (!BC.MIB->evaluateMemOperandTarget(Instruction, TargetAddress,
AbsoluteInstrAddr, Size)) {
errs() << "BOLT-ERROR: PC-relative operand can't be evaluated\n";
exit(1);
}
assert(TargetAddress == Relocation.Value + AbsoluteInstrAddr + Size &&
"Immediate value mismatch detected.");
const MCExpr *Expr = MCSymbolRefExpr::create(
Relocation.Symbol, MCSymbolRefExpr::VK_None, *BC.Ctx);
// Real addend for pc-relative targets is adjusted with a delta
// from relocation placement to the next instruction.
const uint64_t TargetAddend =
Relocation.Addend + Offset + Size - Relocation.Offset;
if (TargetAddend) {
const MCConstantExpr *Offset =
MCConstantExpr::create(TargetAddend, *BC.Ctx);
Expr = MCBinaryExpr::createAdd(Expr, Offset, *BC.Ctx);
}
BC.MIB->replaceMemOperandDisp(
Instruction, MCOperand::createExpr(BC.MIB->getTargetExprFor(
Instruction, Expr, *BC.Ctx, 0)));
UsedReloc = true;
}
// For aarch64, if we replaced an immediate with a symbol from a
// relocation, we mark it so we do not try to further process a
// pc-relative operand. All we need is the symbol.
UsedReloc = true;
}
if (MIB->hasPCRelOperand(Instruction) && !UsedReloc)
@ -1432,6 +1395,9 @@ add_instruction:
addInstruction(Offset, std::move(Instruction));
}
// Reset symbolizer for the disassembler.
BC.SymbolicDisAsm->setSymbolizer(nullptr);
clearList(Relocations);
if (!IsSimple) {

View File

@ -2,6 +2,7 @@ set(LLVM_LINK_COMPONENTS
DebugInfoDWARF
Demangle
MC
MCDisassembler
Object
Support
)

View File

@ -2,12 +2,14 @@ set(LLVM_LINK_COMPONENTS
BOLTCore
BOLTUtils
MC
MCDisassembler
Support
X86Desc
)
add_llvm_library(LLVMBOLTTargetX86
X86MCPlusBuilder.cpp
X86MCSymbolizer.cpp
DEPENDS
X86CommonTableGen

View File

@ -13,6 +13,7 @@
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86InstrRelaxTables.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86MCSymbolizer.h"
#include "bolt/Core/MCPlus.h"
#include "bolt/Core/MCPlusBuilder.h"
#include "llvm/BinaryFormat/ELF.h"
@ -81,6 +82,11 @@ public:
const MCRegisterInfo *RegInfo)
: MCPlusBuilder(Analysis, Info, RegInfo) {}
/// Build the X86 symbolizer for \p Function and return it through the
/// generic MCSymbolizer interface.
std::unique_ptr<MCSymbolizer>
createTargetSymbolizer(BinaryFunction &Function) const override {
  std::unique_ptr<MCSymbolizer> Symbolizer =
      std::make_unique<X86MCSymbolizer>(Function);
  return Symbolizer;
}
bool isBranch(const MCInst &Inst) const override {
return Analysis->isBranch(Inst) && !isTailCall(Inst);
}

View File

@ -0,0 +1,107 @@
//===- bolt/Target/X86/X86MCSymbolizer.cpp --------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "X86MCSymbolizer.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "bolt/Core/BinaryContext.h"
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Core/MCPlusBuilder.h"
#include "bolt/Core/Relocation.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCRegisterInfo.h"
#define DEBUG_TYPE "bolt-symbolizer"
namespace llvm {
namespace bolt {
// Out-of-line destructor definition; there is nothing to release explicitly.
X86MCSymbolizer::~X86MCSymbolizer() = default;
/// Try to append a symbolic expression operand to \p Inst in place of the
/// immediate currently being decoded.
///
/// \param Inst        instruction under construction; on success one
///                    expression operand is appended to it.
/// \param CStream     not used by this implementation.
/// \param Value       immediate value; only consulted on the PC-relative
///                    memory-operand path, where it is passed to
///                    BinaryContext::handleAddressRef as the address.
/// \param InstAddress address of the instruction.
/// \param IsBranch    when true the operand is left alone.
/// \param ImmOffset   offset of the immediate inside the instruction.
/// \param ImmSize     size of the immediate; zero is treated as an implicit
///                    operand and ignored.
/// \param InstSize    size of the whole instruction.
/// \returns true if an operand was appended to \p Inst, false otherwise.
bool X86MCSymbolizer::tryAddingSymbolicOperand(
    MCInst &Inst, raw_ostream &CStream, int64_t Value, uint64_t InstAddress,
    bool IsBranch, uint64_t ImmOffset, uint64_t ImmSize, uint64_t InstSize) {
  // Branch operands and implicit (zero-sized) operands are never symbolized
  // here.
  if (IsBranch || ImmSize == 0)
    return false;

  BinaryContext &BC = Function.getBinaryContext();
  MCContext &Context = *BC.Ctx.get();

  // Branches and calls are skipped based on the instruction itself too.
  if (BC.MIB->isBranch(Inst) || BC.MIB->isCall(Inst))
    return false;

  // Appends `Sym` (plus `Off` when it is non-zero) to Inst as an expression
  // operand.
  auto AppendSymbolRef = [&Inst, &Context](const MCSymbol *Sym, uint64_t Off) {
    const MCExpr *Ref = MCSymbolRefExpr::create(Sym, Context);
    if (Off) {
      const MCExpr *Delta = MCConstantExpr::create(Off, Context);
      Ref = MCBinaryExpr::createAdd(Ref, Delta, Context);
    }
    Inst.addOperand(MCOperand::createExpr(Ref));
  };

  // First choice: a relocation placed exactly at this operand's offset within
  // the function.
  const uint64_t InstOffset = InstAddress - Function.getAddress();
  const Relocation *Rel = Function.getRelocationAt(InstOffset + ImmOffset);
  if (Rel != nullptr) {
    const bool PCRelative = Rel->isPCRelative();

    uint64_t SymbolValue = Rel->Value - Rel->Addend;
    uint64_t Addend = Rel->Addend;
    if (PCRelative) {
      SymbolValue += InstAddress + ImmOffset;
      // The real addend for pc-relative targets is adjusted by the distance
      // from the relocation placement to the next instruction.
      Addend += InstOffset + InstSize - Rel->Offset;
    }

    // Register the reference to the symbol with the binary context.
    BC.handleAddressRef(SymbolValue, Function, PCRelative);

    AppendSymbolRef(Rel->Symbol, Addend);
    return true;
  }

  // No relocation. The only other operand symbolized is the displacement of a
  // compound memory operand that uses PC-relative addressing.
  const int MemOpNo = BC.MIB->getMemoryOperandNo(Inst);
  if (MemOpNo == -1)
    return false;

  // The displacement must be the very next operand to be added.
  const unsigned DispOpNo = MemOpNo + X86::AddrDisp;
  if (Inst.getNumOperands() != DispOpNo)
    return false;

  // The base register has to be the program counter.
  const MCOperand &BaseOp = Inst.getOperand(MemOpNo + X86::AddrBaseReg);
  if (Base­Op.getReg() != BC.MRI->getProgramCounter())
    return false;

  // Reject operands that combine a non-zero scale with a real index register.
  const MCOperand &ScaleOp = Inst.getOperand(MemOpNo + X86::AddrScaleAmt);
  const MCOperand &IndexOp = Inst.getOperand(MemOpNo + X86::AddrIndexReg);
  const bool HasScaledIndex =
      ScaleOp.getImm() != 0 && IndexOp.getReg() != MCRegister::NoRegister;
  if (HasScaledIndex)
    return false;

  const MCSymbol *RefSymbol;
  uint64_t RefOffset;
  std::tie(RefSymbol, RefOffset) =
      BC.handleAddressRef(Value, Function, /*IsPCRel*/ true);
  AppendSymbolRef(RefSymbol, RefOffset);
  return true;
}
/// No-op: the body is empty, so nothing is written to \p CStream and
/// \p Value and \p Address are ignored.
void X86MCSymbolizer::tryAddingPcLoadReferenceComment(raw_ostream &CStream,
                                                      int64_t Value,
                                                      uint64_t Address) {
  // Nothing to do.
}
} // namespace bolt
} // namespace llvm

View File

@ -0,0 +1,43 @@
//===- bolt/Target/X86/X86MCSymbolizer.h ------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef BOLT_CORE_X86MCSYMBOLIZER_H
#define BOLT_CORE_X86MCSYMBOLIZER_H
#include "bolt/Core/BinaryFunction.h"
#include "llvm/MC/MCDisassembler/MCSymbolizer.h"
namespace llvm {
namespace bolt {
class X86MCSymbolizer : public MCSymbolizer {
protected:
BinaryFunction &Function;
public:
X86MCSymbolizer(BinaryFunction &Function)
: MCSymbolizer(*Function.getBinaryContext().Ctx.get(), nullptr),
Function(Function) {}
X86MCSymbolizer(const X86MCSymbolizer &) = delete;
X86MCSymbolizer &operator=(const X86MCSymbolizer &) = delete;
virtual ~X86MCSymbolizer();
bool tryAddingSymbolicOperand(MCInst &Inst, raw_ostream &CStream,
int64_t Value, uint64_t Address, bool IsBranch,
uint64_t Offset, uint64_t OpSize,
uint64_t InstSize) override;
void tryAddingPcLoadReferenceComment(raw_ostream &CStream, int64_t Value,
uint64_t Address) override;
};
} // namespace bolt
} // namespace llvm
#endif

View File

@ -0,0 +1,42 @@
## Check that BOLT can correctly use relocations to symbolize instruction
## operands when an instruction can have up to two relocations associated
## with it.
##
## Each movq below has an immediate operand and an absolute memory operand
## that both encode the value 0x80000. The llvm-objdump output is therefore
## identical for all three instructions, while the llvm-bolt output is expected
## to show VAR only for the operand(s) that referenced the symbol in this
## source.
# REQUIRES: system-linux
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-linux %s -o %t.o
# RUN: ld.lld %t.o -o %t.exe -q --Tdata=0x80000
# RUN: llvm-bolt %t.exe -relocs -o /dev/null -print-only=_start -print-disasm \
# RUN: | FileCheck %s --check-prefix=CHECK-BOLT
# RUN: llvm-objdump -d --print-imm-hex %t.exe \
# RUN: | FileCheck %s --check-prefix=CHECK-OBJDUMP
## VAR is the only symbol defined in .data; the link command above passes
## --Tdata=0x80000.
.data
.globl VAR
VAR:
.quad
.text
.globl _start
.type _start,@function
_start:
.cfi_startproc
## VAR value is 0x80000. Using relocations, llvm-bolt should correctly
## symbolize the instruction operands.
## Case 1: only the immediate refers to VAR; the memory operand is the
## literal address 0x80000.
movq $VAR, 0x80000
# CHECK-BOLT: movq $VAR, 0x80000
# CHECK-OBJDUMP: movq $0x80000, 0x80000
## Case 2: only the memory operand refers to VAR; the immediate is the
## literal constant 0x80000.
movq $0x80000, VAR
# CHECK-BOLT-NEXT: movq $0x80000, VAR
# CHECK-OBJDUMP-NEXT: movq $0x80000, 0x80000
## Case 3: both operands refer to VAR.
movq $VAR, VAR
# CHECK-BOLT-NEXT: movq $VAR, VAR
# CHECK-OBJDUMP-NEXT: movq $0x80000, 0x80000
retq
.size _start, .-_start
.cfi_endproc