Handle long 2.x bytecode literals more efficiently

2024-11-27 07:00:42 +00:00 · 2022-04-27 13:47:56 -04:00 · 2022-04-27 13:47:56 -04:00 · 8e5faa933f
commit 8e5faa933f
parent cfd6166d8d
7 changed files with 153 additions and 15 deletions
--- a/test/bytecode_2.7_run/05_long_literals.pyc
+++ b/test/bytecode_2.7_run/05_long_literals.pyc
--- a/uncompyle6/parsers/parse2.py
+++ b/uncompyle6/parsers/parse2.py
@ -312,6 +312,14 @@ class Python2Parser(PythonParser):

            opname_base = opname[: opname.rfind("_")]

+            if opname in ("BUILD_CONST_LIST", "BUILD_CONST_SET"):
+                rule = """
+                       add_consts          ::= ADD_VALUE*
+                       const_list          ::= COLLECTION_START add_consts %s
+                       expr                ::= const_list
+                       """ % opname
+                self.addRule(rule, nop_func)
+
            # The order of opname listed is roughly sorted below
            if opname_base in ("BUILD_LIST", "BUILD_SET", "BUILD_TUPLE"):
                # We do this complicated test to speed up parsing of
--- a/uncompyle6/parsers/parse3.py
+++ b/uncompyle6/parsers/parse3.py
@ -748,18 +748,37 @@ class Python3Parser(PythonParser):
                kvlist_n = "expr " * (token.attr)
                rule = "dict ::= %sLOAD_CONST %s" % (kvlist_n, opname)
                self.addRule(rule, nop_func)
+
+            elif opname in ("BUILD_CONST_LIST", "BUILD_CONST_DICT", "BUILD_CONST_SET"):
+                if opname == "BUILD_CONST_DICT":
+                    rule = """
+                           add_consts          ::= ADD_VALUE*
+                           const_list          ::= COLLECTION_START add_consts %s
+                           dict                ::= const_list
+                           expr                ::= dict
+                           """ % opname
+                else:
+                    rule = """
+                           add_consts          ::= ADD_VALUE*
+                           const_list          ::= COLLECTION_START add_consts %s
+                           expr                ::= const_list
+                           """ % opname
+                self.addRule(rule, nop_func)
+
            elif opname.startswith("BUILD_DICT_OLDER"):
                rule = """dict ::= COLLECTION_START key_value_pairs BUILD_DICT_OLDER
                          key_value_pairs ::= key_value_pair+
                          key_value_pair  ::= ADD_KEY ADD_VALUE
                       """
                self.addRule(rule, nop_func)
+
            elif opname.startswith("BUILD_LIST_UNPACK"):
                v = token.attr
                rule = "build_list_unpack ::= %s%s" % ("expr " * v, opname)
                self.addRule(rule, nop_func)
                rule = "expr ::= build_list_unpack"
                self.addRule(rule, nop_func)
+
            elif opname_base in ("BUILD_MAP", "BUILD_MAP_UNPACK"):
                kvlist_n = "kvlist_%s" % token.attr
                if opname == "BUILD_MAP_n":
--- a/uncompyle6/scanner.py
+++ b/uncompyle6/scanner.py
@ -1,4 +1,4 @@
-#  Copyright (c) 2016, 2018-2021 by Rocky Bernstein
+#  Copyright (c) 2016, 2018-2022 by Rocky Bernstein
 #  Copyright (c) 2005 by Dan Pascu <dan@windowmaker.org>
 #  Copyright (c) 2000-2002 by hartmut Goebel <h.goebel@crazy-compilers.com>
 #  Copyright (c) 1999 John Aycock
@ -24,7 +24,6 @@ scanners, e.g. for Python 2.7 or 3.4.
 from typing import Optional
 from array import array
 from collections import namedtuple
-from sys import intern  # noqa

 from uncompyle6.scanners.tok import Token
 from xdis.version_info import IS_PYPY, version_tuple_to_str
@ -125,6 +124,80 @@ class Scanner(object):
        # FIXME: This weird Python2 behavior is not Python3
        self.resetTokenClass()

+    def bound_collection_from_tokens(
+        self, tokens, t, i, collection_type
+    ):
+        """
+        Try to rewrite the run of simple load tokens that feeds a
+        collection-building instruction into a bounded sequence:
+
+            COLLECTION_START  ADD_VALUE ...  BUILD_<collection_type>
+
+        so that the grammar can match a long literal with a single
+        ``ADD_VALUE*`` rule instead of one ``expr`` per element.
+
+        Parameters:
+          tokens:          tokens produced so far; it is indexed and sliced
+                           here but not modified.
+          t:               token for the collection-building instruction;
+                           ``t.attr`` is taken as the number of elements.
+          i:               index in ``tokens`` just past the last element;
+                           the elements are ``tokens[i - count:i]``.
+          collection_type: a name looked up in CONST_COLLECTIONS, for
+                           example "CONST_LIST" or "CONST_DICT".
+
+        Returns a new token list ending in the BUILD_<collection_type>
+        token, or None when the collection has fewer than 5 entries or
+        when any element is not one of the load instructions checked below.
+        """
+        count = t.attr
+        assert isinstance(count, int)
+
+        assert count <= i
+
+        if collection_type == "CONST_DICT":
+            # Constant dictionaries work via BUILD_CONST_KEY_MAP and
+            # handle the values() like sets and lists.
+            # However, the keys() are a single LOAD_CONST of the keys.
+            # Adjust the count to account for this extra token.
+            # NOTE(review): the "count <= i" assertion above runs before this
+            # increment, so collection_start below can be -1 when
+            # t.attr == i -- confirm callers cannot hit that case.
+            count += 1
+
+        # For small lists don't bother
+        if count < 5:
+            return None
+
+        collection_start = i - count
+
+        for j in range(collection_start, i):
+            if tokens[j].kind not in (
+                "LOAD_CONST",
+                "LOAD_FAST",
+                "LOAD_GLOBAL",
+                "LOAD_NAME",
+            ):
+                return None
+
+        # NOTE(review): list.index() raises ValueError for a missing item;
+        # this presumably relies on collection_type always being listed in
+        # CONST_COLLECTIONS -- verify against callers.
+        collection_enum = CONST_COLLECTIONS.index(collection_type)
+
+        # If we get here, every token in tokens[collection_start:i] is one of
+        # the simple LOAD_ instructions checked above, so we can add a
+        # boundary marker and turn each of those loads into an ADD_VALUE token.
+        # NOTE(review): this slice drops the *last* count tokens, which is the
+        # same as tokens[:collection_start] only when i == len(tokens) --
+        # confirm that every caller passes i that way.
+        new_tokens = tokens[:-count]
+        start_offset = tokens[collection_start].offset
+        new_tokens.append(
+            Token(
+                opname="COLLECTION_START",
+                attr=collection_enum,
+                pattr=collection_type,
+                offset="%s_0" % start_offset,
+                has_arg=True,
+                opc=self.opc,
+                has_extended_arg=False,
+            )
+        )
+        # Re-emit each element as ADD_VALUE, keeping its attr, pattr, offset
+        # and linestart.
+        for j in range(collection_start, i):
+            new_tokens.append(
+                Token(
+                    opname="ADD_VALUE",
+                    attr=tokens[j].attr,
+                    pattr=tokens[j].pattr,
+                    offset=tokens[j].offset,
+                    has_arg=True,
+                    linestart=tokens[j].linestart,
+                    opc=self.opc,
+                    has_extended_arg=False,
+                )
+            )
+        # Close the sequence with a renamed copy of the build instruction,
+        # e.g. BUILD_CONST_LIST, so the parser can add a rule for it.
+        new_tokens.append(
+            Token(
+                opname="BUILD_%s" % collection_type,
+                attr=t.attr,
+                pattr=t.pattr,
+                offset=t.offset,
+                has_arg=t.has_arg,
+                linestart=t.linestart,
+                opc=t.opc,
+                has_extended_arg=False,
+            )
+        )
+        return new_tokens
+
    def build_instructions(self, co):
        """
        Create a list of instructions (a structured object rather than
--- a/uncompyle6/scanners/scanner2.py
+++ b/uncompyle6/scanners/scanner2.py
@ -200,7 +200,6 @@ class Scanner2(Scanner):
        grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST
        cause specific rules for the specific number of arguments they take.
        """
-
        if not show_asm:
            show_asm = self.show_asm

@ -212,7 +211,7 @@ class Scanner2(Scanner):
                print(instr.disassemble())

        # list of tokens/instructions
-        tokens = []
+        new_tokens = []

        # "customize" is in the process of going away here
        customize = {}
@ -289,7 +288,7 @@ class Scanner2(Scanner):
                        if come_from_type not in ("LOOP", "EXCEPT"):
                            come_from_name = "COME_FROM_%s" % come_from_type
                        pass
-                    tokens.append(
+                    new_tokens.append(
                        Token(
                            come_from_name,
                            jump_offset,
@ -313,6 +312,24 @@ class Scanner2(Scanner):
                if op == self.opc.EXTENDED_ARG:
                    extended_arg += self.extended_arg_val(oparg)
                    continue
+
+                # Note: we match on the opcode name rather than on op,
+                # since BUILD_SET doesn't exist in earlier Pythons.
+                if op_name in (
+                    "BUILD_LIST",
+                    "BUILD_SET",
+                ):
+                    t = Token(
+                        op_name, oparg, pattr, offset, self.linestarts.get(offset, None), op, has_arg, self.opc
+                    )
+                    collection_type = op_name.split("_")[1]
+                    next_tokens = self.bound_collection_from_tokens(
+                        new_tokens, t, len(new_tokens), "CONST_%s" % collection_type
+                    )
+                    if next_tokens is not None:
+                        new_tokens = next_tokens
+                        continue
+
                if op in self.opc.CONST_OPS:
                    const = co.co_consts[oparg]
                    if iscode(const):
@ -347,12 +364,12 @@ class Scanner2(Scanner):
                elif op in self.opc.JREL_OPS:
                    #  use instead: hasattr(self, 'patch_continue'): ?
                    if self.version[:2] == (2, 7):
-                        self.patch_continue(tokens, offset, op)
+                        self.patch_continue(new_tokens, offset, op)
                    pattr = repr(offset + 3 + oparg)
                elif op in self.opc.JABS_OPS:
                    # use instead: hasattr(self, 'patch_continue'): ?
                    if self.version[:2] == (2, 7):
-                        self.patch_continue(tokens, offset, op)
+                        self.patch_continue(new_tokens, offset, op)
                    pattr = repr(oparg)
                elif op in self.opc.LOCAL_OPS:
                    pattr = varnames[oparg]
@ -433,13 +450,13 @@ class Scanner2(Scanner):
            linestart = self.linestarts.get(offset, None)

            if offset not in replace:
-                tokens.append(
+                new_tokens.append(
                    Token(
                        op_name, oparg, pattr, offset, linestart, op, has_arg, self.opc
                    )
                )
            else:
-                tokens.append(
+                new_tokens.append(
                    Token(
                        replace[offset],
                        oparg,
@ -455,10 +472,10 @@ class Scanner2(Scanner):
            pass

        if show_asm in ("both", "after"):
-            for t in tokens:
+            for t in new_tokens:
                print(t.format(line_prefix=""))
            print()
-        return tokens, customize
+        return new_tokens, customize

    def build_statement_indices(self):
        code = self.code
--- a/uncompyle6/scanners/scanner26.py
+++ b/uncompyle6/scanners/scanner26.py
@ -123,7 +123,9 @@ class Scanner26(scan.Scanner2):
            i = self.next_stmt[i]

        extended_arg = 0
+        i = -1
        for offset in self.op_range(0, codelen):
+            i += 1
            op = self.code[offset]
            op_name = self.opname[op]
            oparg = None; pattr = None
@ -156,8 +158,28 @@ class Scanner26(scan.Scanner2):
                oparg = self.get_argument(offset) + extended_arg
                extended_arg = 0
                if op == self.opc.EXTENDED_ARG:
-                    extended_arg = oparg * L65536
-                    continue
+                     extended_arg += self.extended_arg_val(oparg)
+                     continue
+
+
+                # Note: we match on the opcode name rather than on op,
+                # since BUILD_SET doesn't exist in earlier Pythons.
+                if op_name in (
+                    "BUILD_LIST",
+                    "BUILD_SET",
+                ):
+                    t = Token(
+                        op_name, oparg, pattr, offset, self.linestarts.get(offset, None), op, has_arg, self.opc
+                    )
+
+                    collection_type = op_name.split("_")[1]
+                    next_tokens = self.bound_collection_from_tokens(
+                        tokens, t, i, "CONST_%s" % collection_type
+                    )
+                    if next_tokens is not None:
+                        tokens = next_tokens
+                        continue
+
                if op in self.opc.CONST_OPS:
                    const = co.co_consts[oparg]
                    # We can't use inspect.iscode() because we may be
--- a/uncompyle6/scanners/scanner37.py
+++ b/uncompyle6/scanners/scanner37.py
@ -24,8 +24,7 @@ scanner routine for Python 3.

 from typing import Tuple

-from uncompyle6.scanner import CONST_COLLECTIONS
-from uncompyle6.scanners.tok import Token
+from uncompyle6.scanner import CONST_COLLECTIONS, Token
 from uncompyle6.scanners.scanner37base import Scanner37Base

 # bytecode verification, verify(), uses JUMP_OPs from here