Move refactored ingest from 3.6 to 3.x...

We are getting away from working with bytecode in favor of
working with full-fledged structured instructions

Up next: find_jump_targets()
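
To make the shift concrete, here is a minimal sketch of the two styles (not code from this commit; co stands for some code object and opc for an xdis opcode module, matching the imports in the scanner diffs below):

    from array import array
    from xdis.bytecode import Bytecode

    # Old style: raw bytes, with opcodes and operands decoded by hand.
    code = array('B', co.co_code)
    op = code[0]  # just an opcode number at offset 0

    # New style: structured instructions with the fields already decoded.
    for inst in Bytecode(co, opc):
        print(inst.offset, inst.opname, inst.argval)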
rocky 2017-11-06 09:43:49 -05:00
parent 6bffae91fa
commit 124267849c
3 changed files with 27 additions and 313 deletions

test/Makefile

@@ -50,8 +50,8 @@ check-3.6: check-bytecode
$(PYTHON) test_pythonlib.py --bytecode-3.6 --weak-verify $(COMPILE)
# FIXME
-#: this is called when running under pypy3.5-5.8.0
-5.8:
+#: this is called when running under pypy3.5-5.8.0 or pypy2-5.6.0
+5.8 5.6:
#: Check deparsing only, but from a different Python version
check-disasm:
@@ -71,7 +71,7 @@ check-bytecode-2:
check-bytecode-3:
$(PYTHON) test_pythonlib.py --bytecode-3.0 \
--bytecode-3.1 --bytecode-3.2 --bytecode-3.3 \
---bytecode-3.4 --bytecode-3.5 --bytecode-pypy3.2
+--bytecode-3.4 --bytecode-3.5 --bytecode-3.6 --bytecode-pypy3.2
#: Check deparsing bytecode that works running Python 2 and Python 3
check-bytecode: check-bytecode-3
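
For reference, these targets are run in the usual way (assuming this is the project's test/Makefile, as the target names suggest):

    cd test && make check-bytecode-3   # now also exercises --bytecode-3.6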

uncompyle6/scanners/scanner3.py

@@ -27,7 +27,7 @@ from array import array
from uncompyle6.scanner import Scanner
from xdis.code import iscode
-from xdis.bytecode import Bytecode, op_has_argument, instruction_size
+from xdis.bytecode import Bytecode, instruction_size
from xdis.util import code2num
from uncompyle6.scanner import Token, parse_fn_counts
@@ -144,19 +144,24 @@ class Scanner3(Scanner):
def ingest(self, co, classname=None, code_objects={}, show_asm=None):
"""
Pick out tokens from an uncompyle6 code object, and transform them,
-returning a list of uncompyle6 'Token's.
+returning a list of uncompyle6 Token's.
The transformations are made to assist the deparsing grammar.
Specificially:
- various types of LOAD_CONST's are categorized in terms of what they load
- COME_FROM instructions are added to assist parsing control structures
- MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments
+- some EXTENDED_ARGS instructions are removed
Also, when we encounter certain tokens, we add them to a set which will cause custom
grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST
cause specific rules for the specific number of arguments they take.
"""
+# FIXME: remove this when all subsidiary functions have been removed.
+# We should be able to get everything from the self.insts list.
+self.code = array('B', co.co_code)
show_asm = self.show_asm if not show_asm else show_asm
# show_asm = 'both'
if show_asm in ('both', 'before'):
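
As an illustration of the transformations listed in the docstring above, using token names as they appear in the handlers elsewhere in this scanner (the exact stream varies by Python version):

    # Source fragment:         f = lambda x: x
    # Raw instructions:        LOAD_CONST <code <lambda>>, MAKE_FUNCTION 0, STORE_NAME f
    # Tokens after transform:  LOAD_LAMBDA, MAKE_FUNCTION_0, STORE_NAME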
@@ -175,7 +180,6 @@ class Scanner3(Scanner):
if self.is_pypy:
customize['PyPy'] = 0
-self.code = array('B', co.co_code)
self.build_lines_data(co)
self.build_prev_op()
@@ -186,27 +190,20 @@ class Scanner3(Scanner):
# turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
# 'LOAD_ASSERT' is used in assert statements.
self.load_asserts = set()
-bs = list(bytecode)
-n = len(bs)
-for i in range(n):
-inst = bs[i]
+self.insts = list(bytecode)
+n = len(self.insts)
+for i, inst in enumerate(self.insts):
# We need to detect the difference between
# "raise AssertionError" and "assert"
# If we have a JUMP_FORWARD after the
# RAISE_VARARGS then we have a "raise" statement
# else we have an "assert" statement.
if inst.opname == 'POP_JUMP_IF_TRUE' and i+1 < n:
-next_inst = bs[i+1]
+next_inst = self.insts[i+1]
if (next_inst.opname == 'LOAD_GLOBAL' and
next_inst.argval == 'AssertionError'):
-for j in range(i+2, n):
-raise_inst = bs[j]
-if raise_inst.opname.startswith('RAISE_VARARGS'):
-if j+1 >= n or bs[j+1].opname != 'JUMP_FORWARD':
-self.load_asserts.add(next_inst.offset)
-pass
-break
+if (i + 2 < n and self.insts[i+2].opname.startswith('RAISE_VARARGS')):
+self.load_asserts.add(next_inst.offset)
pass
pass
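
The instruction triple the new scan matches is easy to see with the standard dis module; a sketch (the shape below holds for the CPython 3.x releases this scanner targets, before 3.9 introduced a dedicated assertion opcode):

    import dis

    def f(x):
        assert x

    dis.dis(f)
    # The body disassembles to the pattern checked above:
    #   LOAD_FAST          x
    #   POP_JUMP_IF_TRUE   <past the raise>
    #   LOAD_GLOBAL        AssertionError   <- offset recorded in load_asserts
    #   RAISE_VARARGS      1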
@@ -216,28 +213,15 @@ class Scanner3(Scanner):
# print("XXX2", jump_targets)
last_op_was_break = False
-extended_arg = 0
for i, inst in enumerate(bytecode):
argval = inst.argval
op = inst.opcode
-has_arg = op_has_argument(op, self.opc)
-if has_arg:
-if op == self.opc.EXTENDED_ARG:
-extended_arg += self.extended_arg_val(argval)
-# Normally we remove EXTENDED_ARG from the
-# opcodes, but in the case of annotated functions
-# can use the EXTENDED_ARG tuple to signal we have
-# an annotated function.
-if not bs[i+1].opname.startswith("MAKE_FUNCTION"):
-continue
-if isinstance(argval, int) and extended_arg:
-min_extended= self.extended_arg_val(1)
-if argval < min_extended:
-argval += extended_arg
-extended_arg = 0
+if op == self.opc.EXTENDED_ARG:
+# FIXME: The EXTENDED_ARG is used to signal annotation
+# parameters
+if self.insts[i+1].opcode != self.opc.MAKE_FUNCTION:
+continue
if inst.offset in jump_targets:
jump_idx = 0
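
For background on what the removed accumulator was computing: EXTENDED_ARG contributes high-order bits to the following instruction's argument. A sketch of the general CPython 3.6+ wordcode rule (the names here are hypothetical, not this scanner's helpers):

    EXTENDED_ARG = 144  # the opcode's number in CPython 3.6

    def decode_args(code_bytes):
        # Yield (offset, opcode, argument) with EXTENDED_ARG folded in.
        extended_arg = 0
        for offset in range(0, len(code_bytes), 2):
            opcode, raw_arg = code_bytes[offset], code_bytes[offset + 1]
            if opcode == EXTENDED_ARG:
                extended_arg = (extended_arg | raw_arg) << 8
            else:
                yield offset, opcode, extended_arg | raw_arg
                extended_arg = 0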
@@ -256,9 +240,6 @@ class Scanner3(Scanner):
pass
elif inst.offset in self.except_targets:
come_from_name = 'COME_FROM_EXCEPT_CLAUSE'
-if self.version <= 3.2:
-continue
-pass
tokens.append(Token(come_from_name,
None, repr(jump_offset),
offset='%s_%s' % (inst.offset, jump_idx),
@@ -336,7 +317,7 @@ class Scanner3(Scanner):
offset = inst.offset,
linestart = inst.starts_line,
op = op,
-has_arg = op_has_argument(op, op3),
+has_arg = inst.has_arg,
opc = self.opc
)
)
@@ -415,7 +396,7 @@ class Scanner3(Scanner):
offset = inst.offset,
linestart = inst.starts_line,
op = op,
-has_arg = (op >= op3.HAVE_ARGUMENT),
+has_arg = inst.has_arg,
opc = self.opc
)
)
@@ -1063,9 +1044,9 @@ class Scanner3(Scanner):
op = self.code[i]
if op == self.opc.END_FINALLY:
if count_END_FINALLY == count_SETUP_:
-assert self.code[self.prev_op[i]] in (JUMP_ABSOLUTE,
-JUMP_FORWARD,
-RETURN_VALUE)
+assert self.code[self.prev_op[i]] in frozenset([self.opc.JUMP_ABSOLUTE,
+self.opc.JUMP_FORWARD,
+self.opc.RETURN_VALUE])
self.not_continue.add(self.prev_op[i])
return self.prev_op[i]
count_END_FINALLY += 1
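
The rewritten assertion encodes a bytecode shape rather than anything scanner-specific: on the pre-3.8 versions handled here, the instruction physically preceding END_FINALLY is a jump out of a handler or a return. A quick way to inspect that, as a sketch:

    import dis

    def g():
        try:
            return 1
        except ValueError:
            pass

    dis.dis(g)
    # On CPython 3.6 the except handler ends with JUMP_FORWARD immediately
    # before END_FINALLY, one of the three opcodes the assert allows.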

uncompyle6/scanners/scanner36.py

@@ -13,11 +13,7 @@ from __future__ import print_function
from uncompyle6.scanners.scanner3 import Scanner3
from uncompyle6.scanner import Token, parse_fn_counts
-from xdis.code import iscode
-from xdis.bytecode import Bytecode
-import xdis
-from array import array
# bytecode verification, verify(), uses JUMP_OPS from here
from xdis.opcodes import opcode_36 as opc
@@ -30,7 +26,7 @@ class Scanner36(Scanner3):
return
def ingest(self, co, classname=None, code_objects={}, show_asm=None):
-tokens, customize = self.ingest_internal(co, classname, code_objects, show_asm)
+tokens, customize = Scanner3.ingest(self, co, classname, code_objects, show_asm)
for t in tokens:
# The lowest bit of flags indicates whether the
# var-keyword argument is placed at the top of the stack
@@ -46,269 +42,6 @@ class Scanner36(Scanner3):
pass
return tokens, customize
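
The comment in the loop above quotes CPython 3.6's documentation for CALL_FUNCTION_EX: the lowest bit of the instruction's argument says whether a var-keyword mapping sits on top of the stack. Two call shapes that produce the two cases, as a sketch:

    def h(*args, **kwargs):
        return args, kwargs

    h(*[1, 2])               # compiles to CALL_FUNCTION_EX 0 (no ** mapping)
    h(*[1, 2], **{'a': 3})   # compiles to CALL_FUNCTION_EX 1 (** mapping on top)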
def ingest_internal(self, co, classname=None, code_objects={}, show_asm=None):
"""
Pick out tokens from an uncompyle6 code object, and transform them,
returning a list of uncompyle6 'Token's.
The transformations are made to assist the deparsing grammar.
Specificially:
- various types of LOAD_CONST's are categorized in terms of what they load
- COME_FROM instructions are added to assist parsing control structures
- MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments
Also, when we encounter certain tokens, we add them to a set which will cause custom
grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST
cause specific rules for the specific number of arguments they take.
"""
# FIXME: remove this when all subsidiary functions have been removed.
# We should be able to get everything from the self.insts list.
self.code = array('B', co.co_code)
show_asm = self.show_asm if not show_asm else show_asm
# show_asm = 'both'
if show_asm in ('both', 'before'):
bytecode = Bytecode(co, self.opc)
for instr in bytecode.get_instructions(co):
print(instr.disassemble())
# list of tokens/instructions
tokens = []
# "customize" is a dict whose keys are nonterminals
# and the value is the argument stack entries for that
# nonterminal. The count is a little hoaky. It is mostly
# not used, but sometimes it is.
customize = {}
if self.is_pypy:
customize['PyPy'] = 0
self.build_lines_data(co)
self.build_prev_op()
bytecode = Bytecode(co, self.opc)
# FIXME: put as its own method?
# Scan for assertions. Later we will
# turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
# 'LOAD_ASSERT' is used in assert statements.
self.load_asserts = set()
self.insts = list(bytecode)
n = len(self.insts)
for i, inst in enumerate(self.insts):
# We need to detect the difference between
# "raise AssertionError" and "assert"
# If we have a JUMP_FORWARD after the
# RAISE_VARARGS then we have a "raise" statement
# else we have an "assert" statement.
if inst.opname == 'POP_JUMP_IF_TRUE' and i+1 < n:
next_inst = self.insts[i+1]
if (next_inst.opname == 'LOAD_GLOBAL' and
next_inst.argval == 'AssertionError'):
if (i + 2 < n and self.insts[i+2].opname.startswith('RAISE_VARARGS')):
self.load_asserts.add(next_inst.offset)
pass
pass
# Get jump targets
# Format: {target offset: [jump offsets]}
jump_targets = self.find_jump_targets(show_asm)
# print("XXX2", jump_targets)
last_op_was_break = False
for i, inst in enumerate(bytecode):
argval = inst.argval
op = inst.opcode
if op == self.opc.EXTENDED_ARG:
continue
if inst.offset in jump_targets:
jump_idx = 0
# We want to process COME_FROMs to the same offset to be in *descending*
# offset order so we have the larger range or biggest instruction interval
# last. (I think they are sorted in increasing order, but for safety
# we sort them). That way, specific COME_FROM tags will match up
# properly. For example, a "loop" with an "if" nested in it should have the
# "loop" tag last so the grammar rule matches that properly.
for jump_offset in sorted(jump_targets[inst.offset], reverse=True):
come_from_name = 'COME_FROM'
opname = self.opname_for_offset(jump_offset)
if opname.startswith('SETUP_'):
come_from_type = opname[len('SETUP_'):]
come_from_name = 'COME_FROM_%s' % come_from_type
pass
elif inst.offset in self.except_targets:
come_from_name = 'COME_FROM_EXCEPT_CLAUSE'
tokens.append(Token(come_from_name,
None, repr(jump_offset),
offset='%s_%s' % (inst.offset, jump_idx),
has_arg = True, opc=self.opc))
jump_idx += 1
pass
pass
elif inst.offset in self.else_start:
end_offset = self.else_start[inst.offset]
tokens.append(Token('ELSE',
None, repr(end_offset),
offset='%s' % (inst.offset),
has_arg = True, opc=self.opc))
pass
pattr = inst.argrepr
opname = inst.opname
if opname in ['LOAD_CONST']:
const = argval
if iscode(const):
if const.co_name == '<lambda>':
opname = 'LOAD_LAMBDA'
elif const.co_name == '<genexpr>':
opname = 'LOAD_GENEXPR'
elif const.co_name == '<dictcomp>':
opname = 'LOAD_DICTCOMP'
elif const.co_name == '<setcomp>':
opname = 'LOAD_SETCOMP'
elif const.co_name == '<listcomp>':
opname = 'LOAD_LISTCOMP'
# verify() uses 'pattr' for comparison, since 'attr'
# now holds Code(const) and thus can not be used
# for comparison (todo: think about changing this)
# pattr = 'code_object @ 0x%x %s->%s' %\
# (id(const), const.co_filename, const.co_name)
pattr = '<code_object ' + const.co_name + '>'
else:
pattr = const
pass
elif opname in ('MAKE_FUNCTION', 'MAKE_CLOSURE'):
if self.version >= 3.6:
# 3.6+ doesn't have MAKE_CLOSURE, so opname == 'MAKE_FUNCTION'
flags = argval
opname = 'MAKE_FUNCTION_%d' % (flags)
attr = []
for flag in self.MAKE_FUNCTION_FLAGS:
bit = flags & 1
if bit:
if pattr:
pattr += ", " + flag
else:
pattr += flag
attr.append(bit)
flags >>= 1
attr = attr[:4] # remove last value: attr[5] == False
else:
pos_args, name_pair_args, annotate_args = parse_fn_counts(inst.argval)
pattr = ("%d positional, %d keyword pair, %d annotated" %
(pos_args, name_pair_args, annotate_args))
if name_pair_args > 0:
opname = '%s_N%d' % (opname, name_pair_args)
pass
if annotate_args > 0:
opname = '%s_A_%d' % (opname, annotate_args)
pass
opname = '%s_%d' % (opname, pos_args)
attr = (pos_args, name_pair_args, annotate_args)
tokens.append(
Token(
opname = opname,
attr = attr,
pattr = pattr,
offset = inst.offset,
linestart = inst.starts_line,
op = op,
has_arg = inst.has_arg,
opc = self.opc
)
)
continue
elif op in self.varargs_ops:
pos_args = argval
if self.is_pypy and not pos_args and opname == 'BUILD_MAP':
opname = 'BUILD_MAP_n'
else:
opname = '%s_%d' % (opname, pos_args)
elif self.is_pypy and opname in ('CALL_METHOD', 'JUMP_IF_NOT_DEBUG'):
# The value in the dict is in special cases in semantic actions, such
# as CALL_FUNCTION. The value is not used in these cases, so we put
# in arbitrary value 0.
customize[opname] = 0
elif opname == 'UNPACK_EX':
# FIXME: try with scanner and parser by
# changing argval
before_args = argval & 0xFF
after_args = (argval >> 8) & 0xff
pattr = "%d before vararg, %d after" % (before_args, after_args)
argval = (before_args, after_args)
opname = '%s_%d+%d' % (opname, before_args, after_args)
elif op == self.opc.JUMP_ABSOLUTE:
# Further classify JUMP_ABSOLUTE into backward jumps
# which are used in loops, and "CONTINUE" jumps which
# may appear in a "continue" statement. The loop-type
# and continue-type jumps will help us classify loop
# boundaries The continue-type jumps help us get
# "continue" statements with would otherwise be turned
# into a "pass" statement because JUMPs are sometimes
# ignored in rules as just boundary overhead. In
# comprehensions we might sometimes classify JUMP_BACK
# as CONTINUE, but that's okay since we add a grammar
# rule for that.
pattr = argval
# FIXME: 0 isn't always correct
target = self.get_target(inst.offset, 0)
if target <= inst.offset:
next_opname = self.opname[self.code[inst.offset+3]]
if (inst.offset in self.stmts and
(self.version != 3.0 or (hasattr(inst, 'linestart'))) and
(next_opname not in ('END_FINALLY', 'POP_BLOCK',
# Python 3.0 only uses POP_TOP
'POP_TOP'))):
opname = 'CONTINUE'
else:
opname = 'JUMP_BACK'
# FIXME: this is a hack to catch stuff like:
# if x: continue
# the "continue" is not on a new line.
# There are other situations where we don't catch
# CONTINUE as well.
if tokens[-1].kind == 'JUMP_BACK' and tokens[-1].attr <= argval:
if tokens[-2].kind == 'BREAK_LOOP':
del tokens[-1]
else:
# intern is used because we are changing the *previous* token
tokens[-1].kind = intern('CONTINUE')
if last_op_was_break and opname == 'CONTINUE':
last_op_was_break = False
continue
elif op == self.opc.RETURN_VALUE:
if inst.offset in self.return_end_ifs:
opname = 'RETURN_END_IF'
elif inst.offset in self.load_asserts:
opname = 'LOAD_ASSERT'
last_op_was_break = opname == 'BREAK_LOOP'
tokens.append(
Token(
opname = opname,
attr = argval,
pattr = pattr,
offset = inst.offset,
linestart = inst.starts_line,
op = op,
has_arg = inst.has_arg,
opc = self.opc
)
)
pass
if show_asm in ('both', 'after'):
for t in tokens:
print(t)
print()
return tokens, customize
def find_jump_targets(self, debug):
"""
Detect all offsets in a byte code which are jump targets