Move refactored ingest from 3.6 to 3.x...

We are getting away from working with bytecode in favor of
working with full-fledged structured instructions

Up next: find_jump_targets()
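
To make the shift concrete, here is a minimal sketch of the two styles (not code from this commit; co stands for some code object and opc for an xdis opcode module, matching the imports in the scanner diffs below):

    from array import array
    from xdis.bytecode import Bytecode

    # Old style: raw bytes, with opcodes and operands decoded by hand.
    code = array('B', co.co_code)
    op = code[0]  # just an opcode number at offset 0

    # New style: structured instructions with the fields already decoded.
    for inst in Bytecode(co, opc):
        print(inst.offset, inst.opname, inst.argval)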
rocky 2017-11-06 09:43:49 -05:00
parent 6bffae91fa
commit 124267849c
3 changed files with 27 additions and 313 deletions

test/Makefile

@@ -50,8 +50,8 @@ check-3.6: check-bytecode
$(PYTHON) test_pythonlib.py --bytecode-3.6 --weak-verify $(COMPILE)
# FIXME
-#: this is called when running under pypy3.5-5.8.0
-5.8:
+#: this is called when running under pypy3.5-5.8.0 or pypy2-5.6.0
+5.8 5.6:
#: Check deparsing only, but from a different Python version
check-disasm:
@@ -71,7 +71,7 @@ check-bytecode-2:
check-bytecode-3:
$(PYTHON) test_pythonlib.py --bytecode-3.0 \
--bytecode-3.1 --bytecode-3.2 --bytecode-3.3 \
---bytecode-3.4 --bytecode-3.5 --bytecode-pypy3.2
+--bytecode-3.4 --bytecode-3.5 --bytecode-3.6 --bytecode-pypy3.2
#: Check deparsing bytecode that works running Python 2 and Python 3
check-bytecode: check-bytecode-3
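
For reference, these targets are run in the usual way (assuming this is the project's test/Makefile, as the target names suggest):

    cd test && make check-bytecode-3   # now also exercises --bytecode-3.6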

uncompyle6/scanners/scanner3.py

@@ -27,7 +27,7 @@ from array import array
from uncompyle6.scanner import Scanner
from xdis.code import iscode
-from xdis.bytecode import Bytecode, op_has_argument, instruction_size
+from xdis.bytecode import Bytecode, instruction_size
from xdis.util import code2num
from uncompyle6.scanner import Token, parse_fn_counts
@@ -144,19 +144,24 @@ class Scanner3(Scanner):
def ingest(self, co, classname=None, code_objects={}, show_asm=None):
"""
Pick out tokens from an uncompyle6 code object, and transform them,
-returning a list of uncompyle6 'Token's.
+returning a list of uncompyle6 Token's.
The transformations are made to assist the deparsing grammar.
Specificially:
- various types of LOAD_CONST's are categorized in terms of what they load
- COME_FROM instructions are added to assist parsing control structures
- MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments
+- some EXTENDED_ARGS instructions are removed
Also, when we encounter certain tokens, we add them to a set which will cause custom
grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST
cause specific rules for the specific number of arguments they take.
"""
+# FIXME: remove this when all subsidiary functions have been removed.
+# We should be able to get everything from the self.insts list.
+self.code = array('B', co.co_code)
show_asm = self.show_asm if not show_asm else show_asm
# show_asm = 'both'
if show_asm in ('both', 'before'):
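
As an illustration of the transformations listed in the docstring above, using token names as they appear in the handlers elsewhere in this scanner (the exact stream varies by Python version):

    # Source fragment:         f = lambda x: x
    # Raw instructions:        LOAD_CONST <code <lambda>>, MAKE_FUNCTION 0, STORE_NAME f
    # Tokens after transform:  LOAD_LAMBDA, MAKE_FUNCTION_0, STORE_NAME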
@@ -175,7 +180,6 @@ class Scanner3(Scanner):
if self.is_pypy:
customize['PyPy'] = 0
-self.code = array('B', co.co_code)
self.build_lines_data(co)
self.build_prev_op()
@@ -186,27 +190,20 @@ class Scanner3(Scanner):
# turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
# 'LOAD_ASSERT' is used in assert statements.
self.load_asserts = set()
-bs = list(bytecode)
-n = len(bs)
-for i in range(n):
-inst = bs[i]
+self.insts = list(bytecode)
+n = len(self.insts)
+for i, inst in enumerate(self.insts):
# We need to detect the difference between
# "raise AssertionError" and "assert"
# If we have a JUMP_FORWARD after the
# RAISE_VARARGS then we have a "raise" statement
# else we have an "assert" statement.
if inst.opname == 'POP_JUMP_IF_TRUE' and i+1 < n:
-next_inst = bs[i+1]
+next_inst = self.insts[i+1]
if (next_inst.opname == 'LOAD_GLOBAL' and
next_inst.argval == 'AssertionError'):
-for j in range(i+2, n):
-raise_inst = bs[j]
-if raise_inst.opname.startswith('RAISE_VARARGS'):
-if j+1 >= n or bs[j+1].opname != 'JUMP_FORWARD':
-self.load_asserts.add(next_inst.offset)
-pass
-break
+if (i + 2 < n and self.insts[i+2].opname.startswith('RAISE_VARARGS')):
+self.load_asserts.add(next_inst.offset)
pass
pass
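
The instruction triple the new scan matches is easy to see with the standard dis module; a sketch (the shape below holds for the CPython 3.x releases this scanner targets, before 3.9 introduced a dedicated assertion opcode):

    import dis

    def f(x):
        assert x

    dis.dis(f)
    # The body disassembles to the pattern checked above:
    #   LOAD_FAST          x
    #   POP_JUMP_IF_TRUE   <past the raise>
    #   LOAD_GLOBAL        AssertionError   <- offset recorded in load_asserts
    #   RAISE_VARARGS      1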
@@ -216,28 +213,15 @@ class Scanner3(Scanner):
# print("XXX2", jump_targets)
last_op_was_break = False
-extended_arg = 0
for i, inst in enumerate(bytecode):
argval = inst.argval
op = inst.opcode
-has_arg = op_has_argument(op, self.opc)
-if has_arg:
-if op == self.opc.EXTENDED_ARG:
-extended_arg += self.extended_arg_val(argval)
-# Normally we remove EXTENDED_ARG from the
-# opcodes, but in the case of annotated functions
-# can use the EXTENDED_ARG tuple to signal we have
-# an annotated function.
-if not bs[i+1].opname.startswith("MAKE_FUNCTION"):
-continue
-if isinstance(argval, int) and extended_arg:
-min_extended= self.extended_arg_val(1)
-if argval < min_extended:
-argval += extended_arg
-extended_arg = 0
+if op == self.opc.EXTENDED_ARG:
+# FIXME: The EXTENDED_ARG is used to signal annotation
+# parameters
+if self.insts[i+1].opcode != self.opc.MAKE_FUNCTION:
+continue
if inst.offset in jump_targets:
jump_idx = 0
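
For background on what the removed accumulator was computing: EXTENDED_ARG contributes high-order bits to the following instruction's argument. A sketch of the general CPython 3.6+ wordcode rule (the names here are hypothetical, not this scanner's helpers):

    EXTENDED_ARG = 144  # the opcode's number in CPython 3.6

    def decode_args(code_bytes):
        # Yield (offset, opcode, argument) with EXTENDED_ARG folded in.
        extended_arg = 0
        for offset in range(0, len(code_bytes), 2):
            opcode, raw_arg = code_bytes[offset], code_bytes[offset + 1]
            if opcode == EXTENDED_ARG:
                extended_arg = (extended_arg | raw_arg) << 8
            else:
                yield offset, opcode, extended_arg | raw_arg
                extended_arg = 0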
@@ -256,9 +240,6 @@ class Scanner3(Scanner):
pass
elif inst.offset in self.except_targets:
come_from_name = 'COME_FROM_EXCEPT_CLAUSE'
-if self.version <= 3.2:
-continue
-pass
tokens.append(Token(come_from_name,
None, repr(jump_offset),
offset='%s_%s' % (inst.offset, jump_idx),
@@ -336,7 +317,7 @@ class Scanner3(Scanner):
offset = inst.offset,
linestart = inst.starts_line,
op = op,
-has_arg = op_has_argument(op, op3),
+has_arg = inst.has_arg,
opc = self.opc
)
)
@@ -415,7 +396,7 @@ class Scanner3(Scanner):
offset = inst.offset,
linestart = inst.starts_line,
op = op,
-has_arg = (op >= op3.HAVE_ARGUMENT),
+has_arg = inst.has_arg,
opc = self.opc
)
)
@@ -1063,9 +1044,9 @@ class Scanner3(Scanner):
op = self.code[i]
if op == self.opc.END_FINALLY:
if count_END_FINALLY == count_SETUP_:
-assert self.code[self.prev_op[i]] in (JUMP_ABSOLUTE,
-JUMP_FORWARD,
-RETURN_VALUE)
+assert self.code[self.prev_op[i]] in frozenset([self.opc.JUMP_ABSOLUTE,
+self.opc.JUMP_FORWARD,
+self.opc.RETURN_VALUE])
self.not_continue.add(self.prev_op[i])
return self.prev_op[i]
count_END_FINALLY += 1
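
The rewritten assertion encodes a bytecode shape rather than anything scanner-specific: on the pre-3.8 versions handled here, the instruction physically preceding END_FINALLY is a jump out of a handler or a return. A quick way to inspect that, as a sketch:

    import dis

    def g():
        try:
            return 1
        except ValueError:
            pass

    dis.dis(g)
    # On CPython 3.6 the except handler ends with JUMP_FORWARD immediately
    # before END_FINALLY, one of the three opcodes the assert allows.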

uncompyle6/scanners/scanner36.py

@@ -13,11 +13,7 @@ from __future__ import print_function
from uncompyle6.scanners.scanner3 import Scanner3
from uncompyle6.scanner import Token, parse_fn_counts
-from xdis.code import iscode
-from xdis.bytecode import Bytecode
-import xdis
-from array import array
# bytecode verification, verify(), uses JUMP_OPS from here
from xdis.opcodes import opcode_36 as opc
@@ -30,7 +26,7 @@ class Scanner36(Scanner3):
return
def ingest(self, co, classname=None, code_objects={}, show_asm=None):
-tokens, customize = self.ingest_internal(co, classname, code_objects, show_asm)
+tokens, customize = Scanner3.ingest(self, co, classname, code_objects, show_asm)
for t in tokens:
# The lowest bit of flags indicates whether the
# var-keyword argument is placed at the top of the stack
@@ -46,269 +42,6 @@ class Scanner36(Scanner3):
pass
return tokens, customize
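
The comment in the loop above quotes CPython 3.6's documentation for CALL_FUNCTION_EX: the lowest bit of the instruction's argument says whether a var-keyword mapping sits on top of the stack. Two call shapes that produce the two cases, as a sketch:

    def h(*args, **kwargs):
        return args, kwargs

    h(*[1, 2])               # compiles to CALL_FUNCTION_EX 0 (no ** mapping)
    h(*[1, 2], **{'a': 3})   # compiles to CALL_FUNCTION_EX 1 (** mapping on top)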
def ingest_internal(self, co, classname=None, code_objects={}, show_asm=None):
"""
Pick out tokens from an uncompyle6 code object, and transform them,
returning a list of uncompyle6 'Token's.
The transformations are made to assist the deparsing grammar.
Specificially:
- various types of LOAD_CONST's are categorized in terms of what they load
- COME_FROM instructions are added to assist parsing control structures
- MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments
Also, when we encounter certain tokens, we add them to a set which will cause custom
grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST
cause specific rules for the specific number of arguments they take.
"""
# FIXME: remove this when all subsidiary functions have been removed.
# We should be able to get everything from the self.insts list.
self.code = array('B', co.co_code)
show_asm = self.show_asm if not show_asm else show_asm
# show_asm = 'both'
if show_asm in ('both', 'before'):
bytecode = Bytecode(co, self.opc)
for instr in bytecode.get_instructions(co):
print(instr.disassemble())
# list of tokens/instructions
tokens = []
# "customize" is a dict whose keys are nonterminals
# and the value is the argument stack entries for that
# nonterminal. The count is a little hoaky. It is mostly
# not used, but sometimes it is.
customize = {}
if self.is_pypy:
customize['PyPy'] = 0
self.build_lines_data(co)
self.build_prev_op()
bytecode = Bytecode(co, self.opc)
# FIXME: put as its own method?
# Scan for assertions. Later we will
# turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
# 'LOAD_ASSERT' is used in assert statements.
self.load_asserts = set()
self.insts = list(bytecode)
n = len(self.insts)
for i, inst in enumerate(self.insts):
# We need to detect the difference between
# "raise AssertionError" and "assert"
# If we have a JUMP_FORWARD after the
# RAISE_VARARGS then we have a "raise" statement
# else we have an "assert" statement.
if inst.opname == 'POP_JUMP_IF_TRUE' and i+1 < n:
next_inst = self.insts[i+1]
if (next_inst.opname == 'LOAD_GLOBAL' and
next_inst.argval == 'AssertionError'):
if (i + 2 < n and self.insts[i+2].opname.startswith('RAISE_VARARGS')):
self.load_asserts.add(next_inst.offset)
pass
pass
# Get jump targets
# Format: {target offset: [jump offsets]}
jump_targets = self.find_jump_targets(show_asm)
# print("XXX2", jump_targets)
last_op_was_break = False
for i, inst in enumerate(bytecode):
argval = inst.argval
op = inst.opcode
if op == self.opc.EXTENDED_ARG:
continue
if inst.offset in jump_targets:
jump_idx = 0
# We want to process COME_FROMs to the same offset to be in *descending*
# offset order so we have the larger range or biggest instruction interval
# last. (I think they are sorted in increasing order, but for safety
# we sort them). That way, specific COME_FROM tags will match up
# properly. For example, a "loop" with an "if" nested in it should have the
# "loop" tag last so the grammar rule matches that properly.
for jump_offset in sorted(jump_targets[inst.offset], reverse=True):
come_from_name = 'COME_FROM'
opname = self.opname_for_offset(jump_offset)
if opname.startswith('SETUP_'):
come_from_type = opname[len('SETUP_'):]
come_from_name = 'COME_FROM_%s' % come_from_type
pass
elif inst.offset in self.except_targets:
come_from_name = 'COME_FROM_EXCEPT_CLAUSE'
tokens.append(Token(come_from_name,
None, repr(jump_offset),
offset='%s_%s' % (inst.offset, jump_idx),
has_arg = True, opc=self.opc))
jump_idx += 1
pass
pass
elif inst.offset in self.else_start:
end_offset = self.else_start[inst.offset]
tokens.append(Token('ELSE',
None, repr(end_offset),
offset='%s' % (inst.offset),
has_arg = True, opc=self.opc))
pass
pattr = inst.argrepr
opname = inst.opname
if opname in ['LOAD_CONST']:
const = argval
if iscode(const):
if const.co_name == '<lambda>':
opname = 'LOAD_LAMBDA'
elif const.co_name == '<genexpr>':
opname = 'LOAD_GENEXPR'
elif const.co_name == '<dictcomp>':
opname = 'LOAD_DICTCOMP'
elif const.co_name == '<setcomp>':
opname = 'LOAD_SETCOMP'
elif const.co_name == '<listcomp>':
opname = 'LOAD_LISTCOMP'
# verify() uses 'pattr' for comparison, since 'attr'
# now holds Code(const) and thus can not be used
# for comparison (todo: think about changing this)
# pattr = 'code_object @ 0x%x %s->%s' %\
# (id(const), const.co_filename, const.co_name)
pattr = '<code_object ' + const.co_name + '>'
else:
pattr = const
pass
elif opname in ('MAKE_FUNCTION', 'MAKE_CLOSURE'):
if self.version >= 3.6:
# 3.6+ doesn't have MAKE_CLOSURE, so opname == 'MAKE_FUNCTION'
flags = argval
opname = 'MAKE_FUNCTION_%d' % (flags)
attr = []
for flag in self.MAKE_FUNCTION_FLAGS:
bit = flags & 1
if bit:
if pattr:
pattr += ", " + flag
else:
pattr += flag
attr.append(bit)
flags >>= 1
attr = attr[:4] # remove last value: attr[5] == False
else:
pos_args, name_pair_args, annotate_args = parse_fn_counts(inst.argval)
pattr = ("%d positional, %d keyword pair, %d annotated" %
(pos_args, name_pair_args, annotate_args))
if name_pair_args > 0:
opname = '%s_N%d' % (opname, name_pair_args)
pass
if annotate_args > 0:
opname = '%s_A_%d' % (opname, annotate_args)
pass
opname = '%s_%d' % (opname, pos_args)
attr = (pos_args, name_pair_args, annotate_args)
tokens.append(
Token(
opname = opname,
attr = attr,
pattr = pattr,
offset = inst.offset,
linestart = inst.starts_line,
op = op,
has_arg = inst.has_arg,
opc = self.opc
)
)
continue
elif op in self.varargs_ops:
pos_args = argval
if self.is_pypy and not pos_args and opname == 'BUILD_MAP':
opname = 'BUILD_MAP_n'
else:
opname = '%s_%d' % (opname, pos_args)
elif self.is_pypy and opname in ('CALL_METHOD', 'JUMP_IF_NOT_DEBUG'):
# The value in the dict is in special cases in semantic actions, such
# as CALL_FUNCTION. The value is not used in these cases, so we put
# in arbitrary value 0.
customize[opname] = 0
elif opname == 'UNPACK_EX':
# FIXME: try with scanner and parser by
# changing argval
before_args = argval & 0xFF
after_args = (argval >> 8) & 0xff
pattr = "%d before vararg, %d after" % (before_args, after_args)
argval = (before_args, after_args)
opname = '%s_%d+%d' % (opname, before_args, after_args)
elif op == self.opc.JUMP_ABSOLUTE:
# Further classify JUMP_ABSOLUTE into backward jumps
# which are used in loops, and "CONTINUE" jumps which
# may appear in a "continue" statement. The loop-type
# and continue-type jumps will help us classify loop
# boundaries The continue-type jumps help us get
# "continue" statements with would otherwise be turned
# into a "pass" statement because JUMPs are sometimes
# ignored in rules as just boundary overhead. In
# comprehensions we might sometimes classify JUMP_BACK
# as CONTINUE, but that's okay since we add a grammar
# rule for that.
pattr = argval
# FIXME: 0 isn't always correct
target = self.get_target(inst.offset, 0)
if target <= inst.offset:
next_opname = self.opname[self.code[inst.offset+3]]
if (inst.offset in self.stmts and
(self.version != 3.0 or (hasattr(inst, 'linestart'))) and
(next_opname not in ('END_FINALLY', 'POP_BLOCK',
# Python 3.0 only uses POP_TOP
'POP_TOP'))):
opname = 'CONTINUE'
else:
opname = 'JUMP_BACK'
# FIXME: this is a hack to catch stuff like:
# if x: continue
# the "continue" is not on a new line.
# There are other situations where we don't catch
# CONTINUE as well.
if tokens[-1].kind == 'JUMP_BACK' and tokens[-1].attr <= argval:
if tokens[-2].kind == 'BREAK_LOOP':
del tokens[-1]
else:
# intern is used because we are changing the *previous* token
tokens[-1].kind = intern('CONTINUE')
if last_op_was_break and opname == 'CONTINUE':
last_op_was_break = False
continue
elif op == self.opc.RETURN_VALUE:
if inst.offset in self.return_end_ifs:
opname = 'RETURN_END_IF'
elif inst.offset in self.load_asserts:
opname = 'LOAD_ASSERT'
last_op_was_break = opname == 'BREAK_LOOP'
tokens.append(
Token(
opname = opname,
attr = argval,
pattr = pattr,
offset = inst.offset,
linestart = inst.starts_line,
op = op,
has_arg = inst.has_arg,
opc = self.opc
)
)
pass
if show_asm in ('both', 'after'):
for t in tokens:
print(t)
print()
return tokens, customize
def find_jump_targets(self, debug):
"""
Detect all offsets in a byte code which are jump targets