Flesh out enough of llvm-objdump’s SymbolizerSymbolLookUp() for Mach-O files to
get the literal string “Hello world” printed as a comment on the instruction
that loads the pointer to it. For now this is just for x86_64. So for object
files with relocation entries it produces things like:

	leaq	L_.str(%rip), %rax      ## literal pool for: "Hello world\n"

and similar for fully linked images like executables:

	leaq	0x4f(%rip), %rax        ## literal pool for: "Hello world\n"

Also to allow testing against darwin’s otool(1), I hooked up the existing 
-no-show-raw-insn option to the Mach-O parser code, added the new Mach-O
only -full-leading-addr option to match otool(1)'s printing of addresses and
also added the new -print-imm-hex option.

llvm-svn: 218423
This commit is contained in:
Kevin Enderby 2014-09-24 23:08:22 +00:00
parent 8fdd214f67
commit 5283ad43fd
4 changed files with 312 additions and 38 deletions

View File

@ -1,4 +1,7 @@
// RUN: llvm-objdump -d -m %p/Inputs/hello.obj.macho-x86_64 | FileCheck %s
// RUN: llvm-objdump -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex %p/Inputs/hello.obj.macho-x86_64 | FileCheck %s -check-prefix=OBJ
// RUN: llvm-objdump -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex %p/Inputs/hello.exe.macho-x86_64 | FileCheck %s -check-prefix=EXE
CHECK: leaq L_.str(%rip), %rax
CHECK: callq _printf
OBJ: 0000000000000008 leaq L_.str(%rip), %rax ## literal pool for: "Hello world\n"
OBJ: 0000000000000026 callq _printf
EXE: 0000000100000f38 leaq 0x4f(%rip), %rax ## literal pool for: "Hello world\n"

View File

@ -36,6 +36,7 @@
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/MachO.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/raw_ostream.h"
@ -51,6 +52,14 @@ static cl::opt<bool>
static cl::opt<std::string>
DSYMFile("dsym", cl::desc("Use .dSYM file for debug info"));
static cl::opt<bool>
FullLeadingAddr("full-leading-addr",
cl::desc("Print full leading address"));
static cl::opt<bool>
PrintImmHex("print-imm-hex",
cl::desc("Use hex format for immediate values"));
static std::string ThumbTripleName;
static const Target *GetTarget(const MachOObjectFile *MachOObj,
@ -225,11 +234,14 @@ void llvm::DisassembleInputMachO(StringRef Filename) {
DisassembleInputMachO2(Filename, MachOOF.get());
}
typedef DenseMap<uint64_t, StringRef> SymbolAddressMap;
// The block of info used by the Symbolizer call backs. A pointer to one of
// these is handed to the call backs as their opaque DisInfo argument and is
// cast back to a struct DisassembleInfo there.
struct DisassembleInfo {
// When false, SymbolizerSymbolLookUp() returns no symbolic information.
bool verbose;
// The Mach-O object file being disassembled; the call backs query its
// header, load commands, sections and relocation entries.
MachOObjectFile *O;
// The section currently being disassembled; its address and relocation
// entries are used to resolve references made by instructions in it.
SectionRef S;
// Map from symbol address to symbol name, used by SymbolizerSymbolLookUp()
// to find the name matching a ReferenceValue. Stored as a pointer; the map
// itself is built by the code that fills in this struct.
SymbolAddressMap *AddrMap;
};
// SymbolizerGetOpInfo() is the operand information call back function.
@ -301,7 +313,6 @@ int SymbolizerGetOpInfo(void *DisInfo, uint64_t Pc, uint64_t Offset,
// is the offset from the external symbol.
if (info->O->getAnyRelocationPCRel(RE))
op_info->Value -= Pc + Offset + Size;
// SymbolRef Symbol = (*info->Relocs)[Idx].second;
StringRef SymName;
Symbol.getName(SymName);
const char *name = SymName.data();
@ -343,8 +354,142 @@ int SymbolizerGetOpInfo(void *DisInfo, uint64_t Pc, uint64_t Offset,
}
}
// GuessCstringPointer is passed the address of what might be a pointer to a
// literal string in a cstring section. If that address is in a cstring section
// it returns a pointer to that string. Else it returns nullptr.
const char *GuessCstringPointer(uint64_t ReferenceValue,
struct DisassembleInfo *info) {
uint32_t LoadCommandCount = info->O->getHeader().ncmds;
MachOObjectFile::LoadCommandInfo Load = info->O->getFirstLoadCommandInfo();
for (unsigned I = 0;; ++I) {
if (Load.C.cmd == MachO::LC_SEGMENT_64) {
MachO::segment_command_64 Seg = info->O->getSegment64LoadCommand(Load);
for (unsigned J = 0; J < Seg.nsects; ++J) {
MachO::section_64 Sec = info->O->getSection64(Load, J);
uint32_t section_type = Sec.flags & MachO::SECTION_TYPE;
if (section_type == MachO::S_CSTRING_LITERALS &&
ReferenceValue >= Sec.addr &&
ReferenceValue < Sec.addr + Sec.size) {
uint64_t sect_offset = ReferenceValue - Sec.addr;
uint64_t object_offset = Sec.offset + sect_offset;
StringRef MachOContents = info->O->getData();
uint64_t object_size = MachOContents.size();
const char *object_addr = (const char *)MachOContents.data();
if (object_offset < object_size) {
const char *name = object_addr + object_offset;
return name;
} else {
return nullptr;
}
}
}
} else if (Load.C.cmd == MachO::LC_SEGMENT) {
MachO::segment_command Seg = info->O->getSegmentLoadCommand(Load);
for (unsigned J = 0; J < Seg.nsects; ++J) {
MachO::section Sec = info->O->getSection(Load, J);
uint32_t section_type = Sec.flags & MachO::SECTION_TYPE;
if (section_type == MachO::S_CSTRING_LITERALS &&
ReferenceValue >= Sec.addr &&
ReferenceValue < Sec.addr + Sec.size) {
uint64_t sect_offset = ReferenceValue - Sec.addr;
uint64_t object_offset = Sec.offset + sect_offset;
StringRef MachOContents = info->O->getData();
uint64_t object_size = MachOContents.size();
const char *object_addr = (const char *)MachOContents.data();
if (object_offset < object_size) {
const char *name = object_addr + object_offset;
return name;
} else {
return nullptr;
}
}
}
}
if (I == LoadCommandCount - 1)
break;
else
Load = info->O->getNextLoadCommandInfo(Load);
}
return nullptr;
}
// GuessLiteralPointer returns a string which for the item in the Mach-O file
// for the address passed in as ReferenceValue for printing as a comment with
// the instruction and also returns the corresponding type of that item
// indirectly through ReferenceType.
//
// If ReferenceValue is an address of literal cstring then a pointer to the
// cstring is returned and ReferenceType is set to
// LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr .
//
// TODO: other literals such as Objective-C CFStrings refs, Selector refs,
// Message refs, Class refs and a Symbol address in a literal pool are yet
// to be done here.
const char *GuessLiteralPointer(uint64_t ReferenceValue, uint64_t ReferencePC,
uint64_t *ReferenceType,
struct DisassembleInfo *info) {
// TODO: This rouine's code is only for an x86_64 Mach-O file for now.
unsigned int Arch = info->O->getArch();
if (Arch != Triple::x86_64)
return nullptr;
// First see if there is an external relocation entry at the ReferencePC.
uint64_t sect_addr;
info->S.getAddress(sect_addr);
uint64_t sect_offset = ReferencePC - sect_addr;
bool reloc_found = false;
DataRefImpl Rel;
MachO::any_relocation_info RE;
bool isExtern = false;
SymbolRef Symbol;
for (const RelocationRef &Reloc : info->S.relocations()) {
uint64_t RelocOffset;
Reloc.getOffset(RelocOffset);
if (RelocOffset == sect_offset) {
Rel = Reloc.getRawDataRefImpl();
RE = info->O->getRelocation(Rel);
if (info->O->isRelocationScattered(RE))
continue;
isExtern = info->O->getPlainRelocationExternal(RE);
if (isExtern) {
symbol_iterator RelocSym = Reloc.getSymbol();
Symbol = *RelocSym;
}
reloc_found = true;
break;
}
}
// If there is an external relocation entry for a symbol in a section
// then used that symbol's value for the value of the reference.
if (reloc_found && isExtern) {
if (info->O->getAnyRelocationPCRel(RE)) {
unsigned Type = info->O->getAnyRelocationType(RE);
if (Type == MachO::X86_64_RELOC_SIGNED) {
Symbol.getAddress(ReferenceValue);
}
}
}
// TODO: the code to look for other literals such as Objective-C CFStrings
// refs, Selector refs, Message refs, Class refs will be added here.
const char *name = GuessCstringPointer(ReferenceValue, info);
if (name) {
// TODO: note when the code is added above for Selector refs and Message
// refs we will need check for that here and set the ReferenceType
// accordingly.
*ReferenceType = LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr;
return name;
}
// TODO: look for an indirect symbol with this ReferenceValue which is in
// a literal pool.
return nullptr;
}
// SymbolizerSymbolLookUp is the symbol lookup function passed when creating
// the Symbolizer. It looks up the SymbolValue using the info passed via the
// the Symbolizer. It looks up the ReferenceValue using the info passed via the
// pointer to the struct DisassembleInfo that was passed when MCSymbolizer
// is created and returns the symbol name that matches the ReferenceValue or
// nullptr if none. The ReferenceType is passed in for the IN type of
@ -364,7 +509,7 @@ int SymbolizerGetOpInfo(void *DisInfo, uint64_t Pc, uint64_t Offset,
// stub is returned indirectly through ReferenceName and then ReferenceType is
// set to LLVMDisassembler_ReferenceType_Out_SymbolStub.
//
// When this is called with an value loaded via a PC relative load then
// When this is called with an value loaded via a PC relative load then
// ReferenceType will be LLVMDisassembler_ReferenceType_In_PCrel_Load then the
// SymbolValue is checked to be an address of literal pointer, symbol pointer,
// or an Objective-C meta data reference. If so the output ReferenceType is
@ -374,20 +519,89 @@ const char *SymbolizerSymbolLookUp(void *DisInfo, uint64_t ReferenceValue,
uint64_t ReferencePC,
const char **ReferenceName) {
struct DisassembleInfo *info = (struct DisassembleInfo *)DisInfo;
*ReferenceName = nullptr;
*ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
unsigned int Arch = info->O->getArch();
if (Arch == Triple::x86) {
return nullptr;
} else if (Arch == Triple::x86_64) {
return nullptr;
} else if (Arch == Triple::arm) {
return nullptr;
} else if (Arch == Triple::aarch64) {
return nullptr;
} else {
// If no verbose symbolic information is wanted then just return nullptr.
if (info->verbose == false) {
*ReferenceName = nullptr;
*ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
return nullptr;
}
const char *SymbolName = nullptr;
StringRef name = info->AddrMap->lookup(ReferenceValue);
if (!name.empty())
SymbolName = name.data();
if (*ReferenceType == LLVMDisassembler_ReferenceType_In_PCrel_Load) {
*ReferenceName = GuessLiteralPointer(ReferenceValue, ReferencePC,
ReferenceType, info);
if (*ReferenceName == nullptr)
*ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
// TODO: other types of references to be added.
} else {
*ReferenceName = nullptr;
*ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
}
return SymbolName;
}
//
// DisasmMemoryObject is the memory object handed to DisAsm->getInstruction().
// It carries a BasePC, so the 'address' parameter of getInstruction() can be
// the actual PC of the instruction rather than an offset into the bytes. As a
// result, when a branch displacement is added to the PC of an instruction the
// 'ReferenceValue' passed to the SymbolizerSymbolLookUp() routine is the
// correct target address. This matters for a fully linked Mach-O file, where
// the section being disassembled is generally not linked at address zero.
//
class DisasmMemoryObject : public MemoryObject {
  uint8_t *Contents; // The bytes of the region; not owned by this object.
  uint64_t Extent;   // Number of bytes in Contents.
  uint64_t Base;     // Address that corresponds to Contents[0].

public:
  DisasmMemoryObject(uint8_t *bytes, uint64_t size, uint64_t basePC)
      : Contents(bytes), Extent(size), Base(basePC) {}

  uint64_t getBase() const override { return Base; }
  uint64_t getExtent() const override { return Extent; }

  // Stores the byte at address Addr through Byte and returns 0, or returns -1
  // without touching Byte if Addr is outside [Base, Base + Extent).
  int readByte(uint64_t Addr, uint8_t *Byte) const override {
    // An Addr below Base wraps to a huge unsigned offset and so is rejected
    // by the same comparison as an Addr past the end.
    const uint64_t Offset = Addr - Base;
    if (Offset < Extent) {
      *Byte = Contents[Offset];
      return 0;
    }
    return -1;
  }
};
/// \brief Emits the comments that are stored in the CommentStream.
///
/// Each newline-separated comment is written to \p FormattedOS on its own
/// line, padded to the comment column of \p MAI and prefixed with the comment
/// string of \p MAI followed by a space. Lines are separated by a newline but
/// no newline is written after the last one. Afterwards the backing vector is
/// cleared and the comment stream is resynced with it.
///
/// Each comment in the CommentStream is expected to end with a newline. A
/// final comment that is missing its newline is still emitted.
///
/// \param CommentStream the stream the comments were written to.
/// \param CommentsToEmit the vector backing \p CommentStream.
/// \param FormattedOS the stream the formatted comments are emitted to.
/// \param MAI supplies the comment string and the comment column.
static void emitComments(raw_svector_ostream &CommentStream,
                         SmallString<128> &CommentsToEmit,
                         formatted_raw_ostream &FormattedOS,
                         const MCAsmInfo &MAI) {
  // Flush the stream before taking its content.
  CommentStream.flush();
  StringRef Comments = CommentsToEmit.str();

  // Get the default information for printing a comment.
  const char *CommentBegin = MAI.getCommentString();
  unsigned CommentColumn = MAI.getCommentColumn();

  bool IsFirst = true;
  while (!Comments.empty()) {
    if (!IsFirst)
      FormattedOS << '\n';
    // Emit a line of comments.
    FormattedOS.PadToColumn(CommentColumn);
    size_t Position = Comments.find('\n');
    FormattedOS << CommentBegin << ' ' << Comments.substr(0, Position);
    // Without a terminating newline the rest of the text was just emitted.
    // Stop here: Position + 1 would wrap to 0 and the loop would never end.
    if (Position == StringRef::npos)
      break;
    // Move after the newline character.
    Comments = Comments.substr(Position + 1);
    IsFirst = false;
  }
  FormattedOS.flush();

  // Tell the comment stream that the vector changed underneath it.
  CommentsToEmit.clear();
  CommentStream.resync();
}
static void DisassembleInputMachO2(StringRef Filename,
@ -445,6 +659,12 @@ static void DisassembleInputMachO2(StringRef Filename,
int AsmPrinterVariant = AsmInfo->getAssemblerDialect();
std::unique_ptr<MCInstPrinter> IP(TheTarget->createMCInstPrinter(
AsmPrinterVariant, *AsmInfo, *InstrInfo, *MRI, *STI));
// Set the display preference for hex vs. decimal immediates.
IP->setPrintImmHex(PrintImmHex);
// Comment stream and backing vector.
SmallString<128> CommentsToEmit;
raw_svector_ostream CommentStream(CommentsToEmit);
IP->setCommentStream(CommentStream);
if (!InstrAnalysis || !AsmInfo || !STI || !DisAsm || !IP) {
errs() << "error: couldn't initialize disassembler for target "
@ -467,11 +687,13 @@ static void DisassembleInputMachO2(StringRef Filename,
ThumbTarget->createMCSubtargetInfo(ThumbTripleName, MCPU, FeaturesStr));
ThumbCtx.reset(new MCContext(ThumbAsmInfo.get(), ThumbMRI.get(), nullptr));
ThumbDisAsm.reset(ThumbTarget->createMCDisassembler(*ThumbSTI, *ThumbCtx));
// TODO: add MCSymbolizer here for the ThumbTarget like above for TheTarget.
// TODO: add MCSymbolizer here for the ThumbTarget like above for TheTarget.
int ThumbAsmPrinterVariant = ThumbAsmInfo->getAssemblerDialect();
ThumbIP.reset(ThumbTarget->createMCInstPrinter(
ThumbAsmPrinterVariant, *ThumbAsmInfo, *ThumbInstrInfo, *ThumbMRI,
*ThumbSTI));
// Set the display preference for hex vs. decimal immediates.
ThumbIP->setPrintImmHex(PrintImmHex);
}
if (ThumbTarget && (!ThumbInstrAnalysis || !ThumbAsmInfo || !ThumbSTI ||
@ -564,7 +786,10 @@ static void DisassembleInputMachO2(StringRef Filename,
StringRef Bytes;
Sections[SectIdx].getContents(Bytes);
StringRefMemoryObject memoryObject(Bytes);
uint64_t SectAddress = 0;
Sections[SectIdx].getAddress(SectAddress);
DisasmMemoryObject MemoryObject((uint8_t *)Bytes.data(), Bytes.size(),
SectAddress);
bool symbolTableWorked = false;
// Parse relocations.
@ -581,10 +806,26 @@ static void DisassembleInputMachO2(StringRef Filename,
}
array_pod_sort(Relocs.begin(), Relocs.end());
// Create a map of symbol addresses to symbol names for use by
// the SymbolizerSymbolLookUp() routine.
SymbolAddressMap AddrMap;
for (const SymbolRef &Symbol : MachOOF->symbols()) {
SymbolRef::Type ST;
Symbol.getType(ST);
if (ST == SymbolRef::ST_Function || ST == SymbolRef::ST_Data ||
ST == SymbolRef::ST_Other) {
uint64_t Address;
Symbol.getAddress(Address);
StringRef SymName;
Symbol.getName(SymName);
AddrMap[Address] = SymName;
}
}
// Set up the block of info used by the Symbolizer call backs.
SymbolizerInfo.verbose = true;
SymbolizerInfo.O = MachOOF;
SymbolizerInfo.S = Sections[SectIdx];
SymbolizerInfo.AddrMap = &AddrMap;
// Disassemble symbol by symbol.
for (unsigned SymIdx = 0; SymIdx != Symbols.size(); SymIdx++) {
@ -643,14 +884,22 @@ static void DisassembleInputMachO2(StringRef Filename,
for (uint64_t Index = Start; Index < End; Index += Size) {
MCInst Inst;
uint64_t SectAddress = 0;
Sections[SectIdx].getAddress(SectAddress);
outs() << format("%8" PRIx64 ":\t", SectAddress + Index);
uint64_t PC = SectAddress + Index;
if (FullLeadingAddr) {
if (MachOOF->is64Bit())
outs() << format("%016" PRIx64, PC);
else
outs() << format("%08" PRIx64, PC);
} else {
outs() << format("%8" PRIx64 ":", PC);
}
if (!NoShowRawInsn)
outs() << "\t";
// Check the data in code table here to see if this is data not an
// instruction to be disassembled.
DiceTable Dice;
Dice.push_back(std::make_pair(SectAddress + Index, DiceRef()));
Dice.push_back(std::make_pair(PC, DiceRef()));
dice_table_iterator DTI = std::search(Dices.begin(), Dices.end(),
Dice.begin(), Dice.end(),
compareDiceTableEntries);
@ -664,24 +913,33 @@ static void DisassembleInputMachO2(StringRef Filename,
continue;
}
SmallVector<char, 64> AnnotationsBytes;
raw_svector_ostream Annotations(AnnotationsBytes);
bool gotInst;
if (isThumb)
gotInst = ThumbDisAsm->getInstruction(Inst, Size, memoryObject, Index,
DebugOut, nulls());
gotInst = ThumbDisAsm->getInstruction(Inst, Size, MemoryObject, PC,
DebugOut, Annotations);
else
gotInst = DisAsm->getInstruction(Inst, Size, memoryObject, Index,
DebugOut, nulls());
gotInst = DisAsm->getInstruction(Inst, Size, MemoryObject, PC,
DebugOut, Annotations);
if (gotInst) {
DumpBytes(StringRef(Bytes.data() + Index, Size));
if (!NoShowRawInsn) {
DumpBytes(StringRef(Bytes.data() + Index, Size));
}
formatted_raw_ostream FormattedOS(outs());
Annotations.flush();
StringRef AnnotationsStr = Annotations.str();
if (isThumb)
ThumbIP->printInst(&Inst, outs(), "");
ThumbIP->printInst(&Inst, FormattedOS, AnnotationsStr);
else
IP->printInst(&Inst, outs(), "");
IP->printInst(&Inst, FormattedOS, AnnotationsStr);
emitComments(CommentStream, CommentsToEmit, FormattedOS, *AsmInfo);
// Print debug info.
if (diContext) {
DILineInfo dli =
diContext->getLineInfoForAddress(SectAddress + Index);
diContext->getLineInfoForAddress(PC);
// Print valid line info if it changed.
if (dli != lastLine && dli.Line != 0)
outs() << "\t## " << dli.FileName << ':' << dli.Line << ':'
@ -706,10 +964,21 @@ static void DisassembleInputMachO2(StringRef Filename,
for (uint64_t Index = 0; Index < SectSize; Index += InstSize) {
MCInst Inst;
if (DisAsm->getInstruction(Inst, InstSize, memoryObject, Index,
uint64_t PC = SectAddress + Index;
if (DisAsm->getInstruction(Inst, InstSize, MemoryObject, PC,
DebugOut, nulls())) {
outs() << format("%8" PRIx64 ":\t", SectAddress + Index);
DumpBytes(StringRef(Bytes.data() + Index, InstSize));
if (FullLeadingAddr) {
if (MachOOF->is64Bit())
outs() << format("%016" PRIx64, PC);
else
outs() << format("%08" PRIx64, PC);
} else {
outs() << format("%8" PRIx64 ":", PC);
}
if (!NoShowRawInsn) {
outs() << "\t";
DumpBytes(StringRef(Bytes.data() + Index, InstSize));
}
IP->printInst(&Inst, outs(), "");
outs() << "\n";
} else {

View File

@ -128,9 +128,10 @@ llvm::MAttrs("mattr",
cl::desc("Target specific attributes"),
cl::value_desc("a1,+a2,-a3,..."));
static cl::opt<bool>
NoShowRawInsn("no-show-raw-insn", cl::desc("When disassembling instructions, "
"do not print the instruction bytes."));
cl::opt<bool>
llvm::NoShowRawInsn("no-show-raw-insn", cl::desc("When disassembling "
"instructions, do not print "
"the instruction bytes."));
static cl::opt<bool>
UnwindInfo("unwind-info", cl::desc("Display unwind information"));

View File

@ -27,6 +27,7 @@ extern cl::opt<std::string> TripleName;
extern cl::opt<std::string> ArchName;
extern cl::opt<std::string> MCPU;
extern cl::list<std::string> MAttrs;
extern cl::opt<bool> NoShowRawInsn;
// Various helper functions.
bool error(std::error_code ec);