[llvm.py] Implement interface to enhanced disassembler

This requires a C++ change to EDDisassembler's ctor to function properly
(the llvm::InitializeAll* functions aren't being called currently and
there is no way to call them from Python).

Code is partially tested and works well enough for initial commit. There
are probably many small bugs.

llvm-svn: 152506
This commit is contained in:
Gregory Szorc 2012-03-10 21:05:05 +00:00
parent f4a4d195fb
commit 5b524ff3b9
2 changed files with 626 additions and 0 deletions

View File

@ -0,0 +1,564 @@
#===- disassembler.py - Python LLVM Bindings -----------------*- python -*--===#
#
# The LLVM Compiler Infrastructure
#
# This file is distributed under the University of Illinois Open Source
# License. See LICENSE.TXT for details.
#
#===------------------------------------------------------------------------===#
from abc import ABCMeta
from abc import abstractmethod
from ctypes import CFUNCTYPE
from ctypes import POINTER
from ctypes import byref
from ctypes import c_char_p
from ctypes import c_int
from ctypes import c_ubyte
from ctypes import c_uint64
from ctypes import c_uint
from ctypes import c_void_p
from ctypes import memmove
from .common import CachedProperty
from .common import LLVMObject
from .common import c_object_p
from .common import get_library
__all__ = [
'DisassemblerByteArraySource',
'DisassemblerFileSource',
'DisassemblerSource',
'Disassembler',
'Instruction',
'Operand',
'Token',
]
callbacks = {}
class DisassemblerSource:
"""Abstract base class for disassembler input.
This defines the interface to which inputs to the disassembler must
conform.
Basically, the disassembler input is a read-only sequence of a finite
length.
"""
__metaclass__ = ABCMeta
@abstractmethod
def __len__(self):
"""Returns the number of bytes that are available for input."""
pass
@abstractmethod
def get_byte(self, address):
"""Returns the byte at the specified address."""
pass
@abstractmethod
def start_address(self):
"""Returns the address at which to start fetch bytes, as a long."""
pass
class DisassemblerByteArraySource(DisassemblerSource):
"""A disassembler source for byte arrays."""
def __init__(self, b):
self._array = b
def __len__(self):
return len(self._array)
def get_byte(self, address):
return self._array[address]
def start_address(self):
return 0
class DisassemblerFileSource(DisassemblerSource):
"""A disassembler source for file segments.
This allows you to feed in segments of a file into a Disassembler.
"""
def __init__(self, filename, start_offset, length=None, end_offset=None,
start_address=None):
"""Create a new source from a file.
A source begins at a specified byte offset and can be defined in terms
of byte length of the end byte offset.
"""
if length is None and end_offset is None:
raise Exception('One of length or end_offset must be defined.')
self._start_address = start_address
if self._start_address is None:
self._start_address = 0
count = length
if length is None:
count = end_offset - start_offset
with open(filename, 'rb') as fh:
fh.seek(start_offset)
# FIXME handle case where read bytes != requested
self._buf = fh.read(count)
def __len__(self):
return len(self._buf)
def get_byte(self, address):
return self._buf[address - self._start_address]
def start_address(self):
return self._start_address
class Disassembler(LLVMObject):
"""Interface to LLVM's enhanced disassembler.
The API is slightly different from the C API in that we tightly couple a
disassembler instance to an input source. This saves an extra level of
abstraction and makes the Python implementation easier.
"""
SYNTAX_X86_INTEL = 0
SYNTAX_X86_ATT = 1
SYNTAX_ARM_UAL = 2
def __init__(self, triple, source, syntax=0):
"""Create a new disassembler instance.
Arguments:
triple -- str target type (e.g. x86_64-apple-darwin10)
source -- DisassemblerSource instance to be fed into this disassembler.
syntax -- The assembly syntax to use. One of the SYNTAX_* class
constants. e.g. EnhancedDisassembler.SYNTAX_X86_INTEL
"""
assert isinstance(source, DisassemblerSource)
ptr = c_object_p()
result = lib.EDGetDisassembler(byref(ptr), c_char_p(triple),
c_int(syntax))
if result != 0:
raise Exception('Non-0 return code.')
LLVMObject.__init__(self, ptr)
self._source = source
def get_instructions(self):
"""Obtain the instructions from the input.
This is a generator for Instruction instances.
By default, this will return instructions for the entire source which
has been defined. It does this by querying the source's start_address()
method and continues to request instructions until len(source) is
exhausted.
"""
# We currently obtain 1 instruction at a time because it is easiest.
# This serves as our EDByteReaderCallback. It is a proxy between C and
# the Python DisassemblerSource.
def byte_reader(dest, address, arg):
try:
byte = self._source.get_byte(address)
memmove(dest, byte, 1)
return 0
except:
return -1
address = self._source.start_address()
end_address = address + len(self._source)
cb = callbacks['byte_reader'](byte_reader)
while address < end_address:
ptr = c_object_p()
result = lib.EDCreateInsts(byref(ptr), c_uint(1), self, cb,
address, c_void_p(None))
if result != 1:
raise Exception('Error obtaining instruction at address %d' %
address)
instruction = Instruction(ptr, self)
yield instruction
address += instruction.byte_size
class Instruction(LLVMObject):
"""Represents an individual instruction.
Instruction instances are obtained from Disassembler.get_instructions().
"""
def __init__(self, ptr, disassembler):
"""Create a new instruction.
Instructions are created from within this module. You should have no
need to call this from outside this module.
"""
assert isinstance(ptr, c_object_p)
assert isinstance(disassembler, Disassembler)
LLVMObject.__init__(self, ptr, disposer=lib.EDReleaseInst)
self._disassembler = disassembler
def __str__(self):
s = c_char_p(None)
result = lib.EDGetInstString(byref(s), self)
if result != 0:
raise Exception('Non-0 return code.')
return s.value
@CachedProperty
def byte_size(self):
result = lib.EDInstByteSize(self)
if result == -1:
raise Exception('Error code returned.')
return result
@CachedProperty
def id(self):
i = c_uint()
result = lib.EDInstID(byref(i), self)
if result != 0:
raise Exception('Non-0 return code.')
return i.value
@CachedProperty
def is_branch(self):
result = lib.EDInstIsBranch(self)
if result == -1:
raise Exception('Error code returned.')
return result > 0
@CachedProperty
def is_move(self):
result = lib.EDInstIsMove(self)
if result == -1:
raise Exception('Error code returned.')
return result > 0
@CachedProperty
def branch_target_id(self):
result = lib.EDBranchTargetID(self)
if result == -1:
raise Exception('Error code returned.')
return result
@CachedProperty
def move_source_id(self):
result = lib.EDMoveSourceID(self)
if result == -1:
raise Exception('Error code returned.')
return result
def get_tokens(self):
"""Obtain the tokens in this instruction.
This is a generator for Token instances.
"""
count = lib.EDNumTokens(self)
if count == -1:
raise Exception('Error code returned.')
for i in range(0, count):
ptr = c_object_p()
result = lib.EDGetToken(byref(ptr), self, c_int(i))
if result != 0:
raise Exception('Non-0 return code.')
yield Token(ptr, self)
def get_operands(self):
"""Obtain the operands in this instruction.
This is a generator for Operand instances.
"""
count = lib.EDNumOperands(self)
if count == -1:
raise Exception('Error code returned.')
for i in range(0, count):
ptr = c_object_p()
result = lib.EDGetOperand(byref(ptr), self, c_int(i))
if result != 0:
raise Exception('Non-0 return code.')
yield Operand(ptr, self)
class Token(LLVMObject):
def __init__(self, ptr, instruction):
assert isinstance(ptr, c_object_p)
assert isinstance(instruction, Instruction)
LLVMObject.__init__(self, ptr)
self._instruction = instruction
def __str__(self):
s = c_char_p(None)
result = lib.EDGetTokenString(byref(s), self)
if result != 0:
raise Exception('Non-0 return code.')
return s.value
@CachedProperty
def operand_index(self):
result = lib.EDOperandIndexForToken(self)
if result == -1:
raise Exception('Error code returned.')
return result
@CachedProperty
def is_whitespace(self):
result = lib.EDTokenIsWhitespace(self)
if result == -1:
raise Exception('Error code returned.')
return result > 0
@CachedProperty
def is_punctuation(self):
result = lib.EDTokenIsPunctuation(self)
if result == -1:
raise Exception('Error code returned.')
return result > 0
@CachedProperty
def is_opcode(self):
result = lib.EDTokenIsOpcode(self)
if result == -1:
raise Exception('Error code returned.')
return result > 0
@CachedProperty
def is_literal(self):
result = lib.EDTokenIsLiteral(self)
if result == -1:
raise Exception('Error code returned.')
return result > 0
@CachedProperty
def is_register(self):
result = lib.EDTokenIsRegister(self)
if result == -1:
raise Exception('Error code returned.')
return result > 0
@CachedProperty
def is_negative_literal(self):
result = lib.EDTokenIsNegativeLiteral(self)
if result == -1:
raise Exception('Error code returned.')
return result > 0
@CachedProperty
def absolute_value(self):
value = c_uint64()
result = lib.EDLiteralTokenAbsoluteValue(byref(value), self)
if result != 0:
raise Exception('Non-0 return code.')
return value
@CachedProperty
def register_value(self):
value = c_uint()
result = lib.EDRegisterTokenValue(byref(value), self)
if result != 0:
raise Exception('Non-0 return code.')
return value
class Operand(LLVMObject):
"""Represents an operand in an instruction.
FIXME support register evaluation.
"""
def __init__(self, ptr, instruction):
assert isinstance(ptr, c_object_p)
assert isinstance(instruction, Instruction)
LLVMObject.__init__(self, ptr)
self._instruction = instruction
@CachedProperty
def is_register(self):
result = lib.EDOperandIsRegister(self)
if result == -1:
raise Exception('Error code returned.')
return result > 0
@CachedProperty
def is_immediate(self):
result = lib.EDOperandIsImmediate(self)
if result == -1:
raise Exception('Error code returned.')
return result > 0
@CachedProperty
def is_memory(self):
result = lib.EDOperandIsMemory(self)
if result == -1:
raise Exception('Error code returned.')
return result > 0
@CachedProperty
def register_value(self):
value = c_uint()
result = lib.EDRegisterOperandValue(byref(value), self)
if result != 0:
raise Exception('Non-0 return code.')
return value
@CachedProperty
def immediate_value(self):
value = c_uint64()
result = lib.EDImmediateOperandValue(byref(value), self)
if result != 0:
raise Exception('Non-0 return code.')
return value
def register_library(library):
library.EDGetDisassembler.argtypes = [POINTER(c_object_p), c_char_p, c_int]
library.EDGetDisassembler.restype = c_int
library.EDGetRegisterName.argtypes = [POINTER(c_char_p), Disassembler,
c_uint]
library.EDGetRegisterName.restype = c_int
library.EDRegisterIsStackPointer.argtypes = [Disassembler, c_uint]
library.EDRegisterIsStackPointer.restype = c_int
library.EDRegisterIsProgramCounter.argtypes = [Disassembler, c_uint]
library.EDRegisterIsProgramCounter.restype = c_int
library.EDCreateInsts.argtypes = [POINTER(c_object_p), c_uint,
Disassembler, callbacks['byte_reader'], c_uint64, c_void_p]
library.EDCreateInsts.restype = c_uint
library.EDReleaseInst.argtypes = [Instruction]
library.EDInstByteSize.argtypes = [Instruction]
library.EDInstByteSize.restype = c_int
library.EDGetInstString.argtypes = [POINTER(c_char_p), Instruction]
library.EDGetInstString.restype = c_int
library.EDInstID.argtypes = [POINTER(c_uint), Instruction]
library.EDInstID.restype = c_int
library.EDInstIsBranch.argtypes = [Instruction]
library.EDInstIsBranch.restype = c_int
library.EDInstIsMove.argtypes = [Instruction]
library.EDInstIsMove.restype = c_int
library.EDBranchTargetID.argtypes = [Instruction]
library.EDBranchTargetID.restype = c_int
library.EDMoveSourceID.argtypes = [Instruction]
library.EDMoveSourceID.restype = c_int
library.EDMoveTargetID.argtypes = [Instruction]
library.EDMoveTargetID.restype = c_int
library.EDNumTokens.argtypes = [Instruction]
library.EDNumTokens.restype = c_int
library.EDGetToken.argtypes = [POINTER(c_object_p), Instruction, c_int]
library.EDGetToken.restype = c_int
library.EDGetTokenString.argtypes = [POINTER(c_char_p), Token]
library.EDGetTokenString.restype = c_int
library.EDOperandIndexForToken.argtypes = [Token]
library.EDOperandIndexForToken.restype = c_int
library.EDTokenIsWhitespace.argtypes = [Token]
library.EDTokenIsWhitespace.restype = c_int
library.EDTokenIsPunctuation.argtypes = [Token]
library.EDTokenIsPunctuation.restype = c_int
library.EDTokenIsOpcode.argtypes = [Token]
library.EDTokenIsOpcode.restype = c_int
library.EDTokenIsLiteral.argtypes = [Token]
library.EDTokenIsLiteral.restype = c_int
library.EDTokenIsRegister.argtypes = [Token]
library.EDTokenIsRegister.restype = c_int
library.EDTokenIsNegativeLiteral.argtypes = [Token]
library.EDTokenIsNegativeLiteral.restype = c_int
library.EDLiteralTokenAbsoluteValue.argtypes = [POINTER(c_uint64), Token]
library.EDLiteralTokenAbsoluteValue.restype = c_int
library.EDRegisterTokenValue.argtypes = [POINTER(c_uint), Token]
library.EDRegisterTokenValue.restype = c_int
library.EDNumOperands.argtypes = [Instruction]
library.EDNumOperands.restype = c_int
library.EDGetOperand.argtypes = [POINTER(c_object_p), Instruction, c_int]
library.EDGetOperand.restype = c_int
library.EDOperandIsRegister.argtypes = [Operand]
library.EDOperandIsRegister.restype = c_int
library.EDOperandIsImmediate.argtypes = [Operand]
library.EDOperandIsImmediate.restype = c_int
library.EDOperandIsMemory.argtypes = [Operand]
library.EDOperandIsMemory.restype = c_int
library.EDRegisterOperandValue.argtypes = [POINTER(c_uint), Operand]
library.EDRegisterOperandValue.restype = c_int
library.EDImmediateOperandValue.argtypes = [POINTER(c_uint64), Operand]
library.EDImmediateOperandValue.restype = c_int
library.EDEvaluateOperand.argtypes = [c_uint64, Operand,
callbacks['register_reader'], c_void_p]
library.EDEvaluateOperand.restype = c_int
# Enhanced disassembler.
callbacks['byte_reader'] = CFUNCTYPE(c_int, POINTER(c_ubyte), c_uint64,
c_void_p)
callbacks['register_reader'] = CFUNCTYPE(c_int, POINTER(c_uint64), c_uint,
c_void_p)
lib = get_library()
register_library(lib)

View File

@ -0,0 +1,62 @@
from unittest import expectedFailure
from unittest import skip
from .base import TestBase
from ..disassembler import DisassemblerByteArraySource
from ..disassembler import DisassemblerFileSource
from ..disassembler import Disassembler
from ..object import ObjectFile
class TestDisassembler(TestBase):
def test_simple(self):
sequence = '\x67\xe3\x81' # jcxz -127
triple = 'i686-apple-darwin9'
source = DisassemblerByteArraySource(sequence)
disassembler = Disassembler(triple, source)
instructions = list(disassembler.get_instructions())
self.assertEqual(len(instructions), 1)
i = instructions[0]
self.assertEqual(str(i), '\tjcxz\t-127\n')
self.assertEqual(i.byte_size, 3)
self.assertEqual(i.id, 1032)
self.assertTrue(i.is_branch)
self.assertFalse(i.is_move)
self.assertEqual(i.branch_target_id, 0)
tokens = list(i.get_tokens())
self.assertEqual(len(tokens), 4)
token = tokens[0]
self.assertEqual(str(token), 'jcxz')
self.assertFalse(token.is_whitespace)
self.assertFalse(token.is_punctuation)
self.assertTrue(token.is_opcode)
self.assertFalse(token.is_literal)
self.assertFalse(token.is_register)
self.assertTrue(tokens[1].is_whitespace)
operands = list(i.get_operands())
self.assertEqual(len(operands), 1)
# TODO implement operand tests
@skip('This test is horribly broken and probably not even correct.')
def test_read_instructions(self):
filename = self.get_test_binary()
o = ObjectFile(filename=filename)
for symbol in o.get_symbols():
address = symbol.address
offset = symbol.file_offset
size = symbol.size
source = DisassemblerFileSource(filename, offset, length=size,
start_address=address)
disassembler = Disassembler('x86-generic-gnu-linux', source)
for instruction in disassembler.get_instructions():
print instruction