python-uncompyle6/uncompyle6/verify.py
2023-12-17 10:52:32 -05:00

547 lines
19 KiB
Python
Executable File

#
# (C) Copyright 2015-2018, 2020-2021, 2023 by Rocky Bernstein
# (C) Copyright 2000-2002 by hartmut Goebel <h.goebel@crazy-compilers.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
byte-code verification
"""
from __future__ import print_function
import operator, sys
import xdis.std as dis
from subprocess import call
import uncompyle6
from uncompyle6.scanner import Token as ScannerToken, get_scanner
from xdis import iscode, load_file, load_module, pretty_code_flags, PYTHON_MAGIC_INT
truediv = operator.truediv
from functools import reduce
def code_equal(a, b):
    """Return True when both code objects carry identical raw bytecode.

    Only the ``co_code`` attributes are compared; every other attribute of
    the two objects is ignored.
    """
    bytecode_a = a.co_code
    bytecode_b = b.co_code
    return bytecode_a == bytecode_b
# Maps a binary-operator token kind to the Python function computing it.
# cmp_code_objects() uses this table to check whether two constant operands
# followed by an operator in one token stream correspond to a single
# pre-computed constant in the other.
BIN_OP_FUNCS = {
    "BINARY_POWER": operator.pow,
    "BINARY_MULTIPLY": operator.mul,
    "BINARY_DIVIDE": truediv,
    "BINARY_FLOOR_DIVIDE": operator.floordiv,
    "BINARY_TRUE_DIVIDE": operator.truediv,
    "BINARY_MODULO": operator.mod,
    "BINARY_ADD": operator.add,
    # Was misspelled "BINARY_SUBRACT", so subtraction was never matched.
    "BINARY_SUBTRACT": operator.sub,
    "BINARY_LSHIFT": operator.lshift,
    "BINARY_RSHIFT": operator.rshift,
    "BINARY_AND": operator.and_,
    "BINARY_XOR": operator.xor,
    "BINARY_OR": operator.or_,
}

# Token kinds treated as jumps when comparing tokens; stays None until
# cmp_code_objects() assigns a list to it.
JUMP_OPS = None
# --- exceptions ---
class VerifyCmpError(Exception):
    """Common base class of the comparison errors defined in this module."""
class CmpErrorConsts(VerifyCmpError):
    """Signals that the constants of two code objects differ.

    ``name`` identifies the code object and ``index`` the position of the
    differing constant; both are echoed in the string form.
    """

    def __init__(self, name, index):
        self.name, self.index = name, index

    def __str__(self):
        template = "Compare Error within Consts of %s at index %i"
        return template % (repr(self.name), self.index)
class CmpErrorConstsType(VerifyCmpError):
    """Signals that the type of a constant differs between two code objects.

    ``name`` identifies the code object and ``index`` the position of the
    constant; both are echoed in the string form.
    """

    def __init__(self, name, index):
        self.name, self.index = name, index

    def __str__(self):
        template = "Consts type differ in %s at index %i"
        return template % (repr(self.name), self.index)
class CmpErrorConstsLen(VerifyCmpError):
    """Signals that two ``co_consts`` sequences have different lengths.

    Both sequences are kept in ``self.consts`` and are printed, each with
    its length, by ``__str__``.
    """

    def __init__(self, name, consts1, consts2):
        self.name = name
        self.consts = (consts1, consts2)

    def __str__(self):
        first, second = self.consts[0], self.consts[1]
        details = (
            repr(self.name),
            len(first),
            repr(first),
            len(second),
            repr(second),
        )
        return "Consts length differs in %s:\n\n%i:\t%s\n\n%i:\t%s\n\n" % details
class CmpErrorCode(VerifyCmpError):
    """Exception to be raised when code differs.

    Attributes:
        name: name of the code object being compared.
        index: offset at which the difference was detected.
        token1, token2: the two tokens that did not match.
        tokens: two-element list holding both complete token sequences.
    """

    def __init__(self, name, index, token1, token2, tokens1, tokens2):
        self.name = name
        self.index = index
        self.token1 = token1
        self.token2 = token2
        self.tokens = [tokens1, tokens2]

    def __str__(self):
        """Return a summary line followed by a side-by-side token listing."""
        left, right = self.tokens
        rows = ["Code differs in %s\n" % str(self.name)]
        # Pad the shorter sequence explicitly. A multi-iterable map() stops
        # at the shorter input on Python 3 (and pads with None on Python 2),
        # which dropped or mislabelled the trailing tokens.
        for pos in range(max(len(left), len(right))):
            tok_left = left[pos] if pos < len(left) else ""
            tok_right = right[pos] if pos < len(right) else ""
            rows.append("%-37s\t%-37s\n" % (tok_left, tok_right))
        return (
            "Code differs in %s at offset %s [%s] != [%s]\n\n"
            % (repr(self.name), self.index, repr(self.token1), repr(self.token2))
        ) + "".join(rows)
class CmpErrorCodeLen(VerifyCmpError):
    """Exception to be raised when code length differs.

    Attributes:
        name: name of the code object being compared.
        tokens: two-element list holding both complete token sequences.
    """

    def __init__(self, name, tokens1, tokens2):
        self.name = name
        self.tokens = [tokens1, tokens2]

    def __str__(self):
        """Return a header line followed by a side-by-side token listing."""
        left, right = self.tokens
        rows = ["Code len differs in %s\n" % str(self.name)]
        # The sequences differ in length whenever this error is raised, so
        # the shorter one is padded with blanks. A multi-iterable map() would
        # stop at the shorter input on Python 3 and hide exactly the surplus
        # tokens this report exists to show.
        for pos in range(max(len(left), len(right))):
            tok_left = left[pos] if pos < len(left) else ""
            tok_right = right[pos] if pos < len(right) else ""
            rows.append("%-37s\t%-37s\n" % (tok_left, tok_right))
        return "".join(rows)
class CmpErrorMember(VerifyCmpError):
    """Signals that some other member of two code objects differs.

    ``member`` is the attribute name; the two differing values are kept as
    the pair ``self.data``.
    """

    def __init__(self, name, member, data1, data2):
        self.name = name
        self.member = member
        self.data = (data1, data2)

    def __str__(self):
        value1, value2 = self.data[0], self.data[1]
        details = (repr(self.member), repr(self.name), repr(value1), repr(value2))
        return "Member %s differs in %s:\n\t%s\n\t%s\n" % details
# --- compare ---
# Code-object members that cmp_code_objects() never compares: it skips any
# member whose name appears in this list.
__IGNORE_CODE_MEMBERS__ = [
    "co_filename",
    "co_firstlineno",
    "co_lnotab",
    "co_stacksize",
    "co_names",
]
def cmp_code_objects(version, is_pypy, code_obj1, code_obj2, verify, name=""):
    """
    Compare two code-objects.

    This is the main part of this module.

    Parameters:
        version, is_pypy: passed to get_scanner() to select the scanner used
            for the token-level comparison, and on to recursive calls.
        code_obj1, code_obj2: the code objects to compare; both must satisfy
            iscode().
        verify: comparison mode string. Member comparisons run only when
            this is exactly "verify"; for any other value every member is
            skipped.
        name: dotted name of the enclosing code object, used to label error
            messages. "__main__" is replaced by the code object's own name.

    Returns None when no difference is detected.

    Raises:
        AssertionError: an argument is not a code object, or the two
            objects expose different attribute sets.
        CmpErrorMember: ``co_flags`` or another compared member differs.
        CmpErrorCode, CmpErrorCodeLen: the token streams differ.

    NOTE(review): the ``co_code`` branch requires verify == "strong", but it
    is only reached when verify == "verify", so the token-level comparison
    is currently unreachable. Left as is, since enabling it would change
    what callers observe.
    """
    global JUMP_OPS

    # print code_obj1, type(code_obj2)
    assert iscode(
        code_obj1
    ), "cmp_code_object first object type is %s, not code" % type(code_obj1)
    assert iscode(
        code_obj2
    ), "cmp_code_object second object type is %s, not code" % type(code_obj2)
    # print dir(code_obj1)
    if isinstance(code_obj1, object):
        # new style classes (Python 2.2)
        # assume _both_ code objects to be new style classes
        assert dir(code_obj1) == dir(code_obj2)
    else:
        # old style classes
        assert dir(code_obj1) == code_obj1.__members__
        assert dir(code_obj2) == code_obj2.__members__
        assert code_obj1.__members__ == code_obj2.__members__

    if name == "__main__":
        name = code_obj1.co_name
    else:
        name = "%s.%s" % (name, code_obj1.co_name)
        if name == ".?":
            name = "__main__"

    if isinstance(code_obj1, object) and code_equal(code_obj1, code_obj2):
        # use the new style code-classes' __cmp__ method, which
        # should be faster and more sophisticated
        # if this compare fails, we use the old routine to
        # find out, what exactly is not equal
        # if this compare succeeds, simply return
        # return
        pass

    if isinstance(code_obj1, object):
        members = [x for x in dir(code_obj1) if x.startswith("co_")]
    else:
        members = dir(code_obj1)
    members.sort()  # ; members.reverse()

    tokens1 = None
    for member in members:
        if member in __IGNORE_CODE_MEMBERS__ or verify != "verify":
            pass
        elif member == "co_code":
            if verify != "strong":
                continue
            scanner = get_scanner(version, is_pypy, show_asm=False)

            # JUMP_OPS starts out as None at module level, and the previous
            # ``list(JUMP_OPS) + ["JUMP_BACK"]`` both raised TypeError on
            # None and appended one more "JUMP_BACK" on every call.
            jump_ops = list(JUMP_OPS or ())
            if "JUMP_BACK" not in jump_ops:
                jump_ops.append("JUMP_BACK")
            JUMP_OPS = jump_ops

            # use changed Token class
            # We (re)set this here to save exception handling,
            # which would get confusing.
            scanner.setTokenClass(Token)
            try:
                # ingest both code-objects
                tokens1, customize = scanner.ingest(code_obj1)
                del customize  # save memory
                tokens2, customize = scanner.ingest(code_obj2)
                del customize  # save memory
            finally:
                scanner.resetTokenClass()  # restore Token class

            targets1 = dis.findlabels(code_obj1.co_code)
            tokens1 = [t for t in tokens1 if t.kind != "COME_FROM"]
            tokens2 = [t for t in tokens2 if t.kind != "COME_FROM"]

            i1 = 0
            i2 = 0
            # offset in tokens1 -> offset of the token it was paired with
            offset_map = {}
            # jump target offset in tokens1 -> [(i1, i2, expected offset2)]
            check_jumps = {}
            while i1 < len(tokens1):
                if i2 >= len(tokens2):
                    # tokens2 ran out; tolerate only a trailing
                    # "LOAD_CONST None; RETURN_VALUE" after a RETURN_VALUE
                    if (
                        len(tokens1) == len(tokens2) + 2
                        and tokens1[-1].kind == "RETURN_VALUE"
                        and tokens1[-2].kind == "LOAD_CONST"
                        and tokens1[-2].pattr is None
                        and tokens1[-3].kind == "RETURN_VALUE"
                    ):
                        break
                    else:
                        raise CmpErrorCodeLen(name, tokens1, tokens2)

                offset_map[tokens1[i1].offset] = tokens2[i2].offset

                # Resolve forward jumps recorded earlier that land here.
                for idx1, idx2, offset2 in check_jumps.get(tokens1[i1].offset, []):
                    if offset2 != tokens2[i2].offset:
                        raise CmpErrorCode(
                            name,
                            tokens1[idx1].offset,
                            tokens1[idx1],
                            tokens2[idx2],
                            tokens1,
                            tokens2,
                        )

                if tokens1[i1].kind != tokens2[i2].kind:
                    # NOTE(review): this first test needs both kinds to be
                    # "LOAD_CONST" while the enclosing test needs them to
                    # differ, so this sub-branch cannot be entered.
                    if tokens1[i1].kind == "LOAD_CONST" == tokens2[i2].kind:
                        i = 1
                        while tokens1[i1 + i].kind == "LOAD_CONST":
                            i += 1
                        if tokens1[i1 + i].kind.startswith(
                            ("BUILD_TUPLE", "BUILD_LIST")
                        ) and i == int(tokens1[i1 + i].kind.split("_")[-1]):
                            t = tuple([elem.pattr for elem in tokens1[i1 : i1 + i]])
                            if t != tokens2[i2].pattr:
                                raise CmpErrorCode(
                                    name,
                                    tokens1[i1].offset,
                                    tokens1[i1],
                                    tokens2[i2],
                                    tokens1,
                                    tokens2,
                                )
                            i1 += i + 1
                            i2 += 1
                            continue
                        elif (
                            i == 2
                            and tokens1[i1 + i].kind == "ROT_TWO"
                            and tokens2[i2 + 1].kind == "UNPACK_SEQUENCE_2"
                        ):
                            i1 += 3
                            i2 += 2
                            continue
                        elif i == 2 and tokens1[i1 + i].kind in BIN_OP_FUNCS:
                            f = BIN_OP_FUNCS[tokens1[i1 + i].kind]
                            if (
                                f(tokens1[i1].pattr, tokens1[i1 + 1].pattr)
                                == tokens2[i2].pattr
                            ):
                                i1 += 3
                                i2 += 1
                                continue
                    elif tokens1[i1].kind == "UNARY_NOT":
                        if tokens2[i2].kind == "POP_JUMP_IF_TRUE":
                            if tokens1[i1 + 1].kind == "POP_JUMP_IF_FALSE":
                                i1 += 2
                                i2 += 1
                                continue
                        elif tokens2[i2].kind == "POP_JUMP_IF_FALSE":
                            if tokens1[i1 + 1].kind == "POP_JUMP_IF_TRUE":
                                i1 += 2
                                i2 += 1
                                continue
                    elif (
                        tokens1[i1].kind in ("JUMP_FORWARD", "JUMP_BACK")
                        and tokens1[i1 - 1].kind == "RETURN_VALUE"
                        and tokens2[i2 - 1].kind in ("RETURN_VALUE", "RETURN_END_IF")
                        and int(tokens1[i1].offset) not in targets1
                    ):
                        i1 += 1
                        continue
                    elif (
                        tokens1[i1].kind == "JUMP_BACK"
                        and tokens2[i2].kind == "CONTINUE"
                    ):
                        # FIXME: should make sure that offset is inside loop, not outside of it
                        i1 += 2
                        i2 += 2
                        continue
                    elif (
                        tokens1[i1].kind == "JUMP_FORWARD"
                        and tokens2[i2].kind == "JUMP_BACK"
                        and tokens1[i1 + 1].kind == "JUMP_BACK"
                        and tokens2[i2 + 1].kind == "JUMP_BACK"
                        and int(tokens1[i1].pattr) == int(tokens1[i1].offset) + 3
                    ):
                        if int(tokens1[i1].pattr) == int(tokens1[i1 + 1].offset):
                            i1 += 2
                            i2 += 2
                            continue
                    elif (
                        tokens1[i1].kind == "LOAD_NAME"
                        and tokens2[i2].kind == "LOAD_CONST"
                        and tokens1[i1].pattr == "None"
                        and tokens2[i2].pattr is None
                    ):
                        pass
                    elif (
                        tokens1[i1].kind == "LOAD_GLOBAL"
                        and tokens2[i2].kind == "LOAD_NAME"
                        and tokens1[i1].pattr == tokens2[i2].pattr
                    ):
                        pass
                    elif (
                        tokens1[i1].kind == "LOAD_ASSERT"
                        and tokens2[i2].kind == "LOAD_NAME"
                        and tokens1[i1].pattr == tokens2[i2].pattr
                    ):
                        pass
                    elif (
                        tokens1[i1].kind == "RETURN_VALUE"
                        and tokens2[i2].kind == "RETURN_END_IF"
                    ):
                        pass
                    elif (
                        tokens1[i1].kind == "BUILD_TUPLE_0" and tokens2[i2].pattr == ()
                    ):
                        pass
                    else:
                        raise CmpErrorCode(
                            name,
                            tokens1[i1].offset,
                            tokens1[i1],
                            tokens2[i2],
                            tokens1,
                            tokens2,
                        )
                elif (
                    tokens1[i1].kind in JUMP_OPS
                    and tokens1[i1].pattr != tokens2[i2].pattr
                ):
                    if tokens1[i1].kind == "JUMP_BACK":
                        # Backward target was already paired: check it now.
                        dest1 = int(tokens1[i1].pattr)
                        dest2 = int(tokens2[i2].pattr)
                        if offset_map[dest1] != dest2:
                            raise CmpErrorCode(
                                name,
                                tokens1[i1].offset,
                                tokens1[i1],
                                tokens2[i2],
                                tokens1,
                                tokens2,
                            )
                    else:
                        # Other jump: remember where the second stream says
                        # it lands and check it once tokens1 reaches dest1.
                        # ``dest2`` used to be read here without being
                        # assigned, so the entry was either silently dropped
                        # (NameError swallowed) or recorded with a stale
                        # value left over from an earlier JUMP_BACK.
                        try:
                            dest1 = int(tokens1[i1].pattr)
                            dest2 = int(tokens2[i2].pattr)
                        except (TypeError, ValueError):
                            # operand is not a numeric offset; nothing to track
                            pass
                        else:
                            check_jumps.setdefault(dest1, []).append((i1, i2, dest2))

                i1 += 1
                i2 += 1
            del tokens1, tokens2  # save memory
        elif member == "co_consts":
            # partial optimization can make the co_consts look different,
            # so we'll just compare the code consts
            codes1 = (c for c in code_obj1.co_consts if hasattr(c, "co_consts"))
            codes2 = (c for c in code_obj2.co_consts if hasattr(c, "co_consts"))

            # zip() stops at the shorter side, so surplus nested code
            # objects on either side are not reported.
            for c1, c2 in zip(codes1, codes2):
                cmp_code_objects(version, is_pypy, c1, c2, verify, name=name)
        elif member == "co_flags":
            flags1 = code_obj1.co_flags
            flags2 = code_obj2.co_flags
            if is_pypy:
                # For PYPY for now we don't care about PYPY_SOURCE_IS_UTF8:
                flags2 &= ~0x0100  # PYPY_SOURCE_IS_UTF8
            # We also don't care about COROUTINE or GENERATOR for now
            flags1 &= ~0x000000A0
            flags2 &= ~0x000000A0
            if flags1 != flags2:
                raise CmpErrorMember(
                    name,
                    "co_flags",
                    pretty_code_flags(flags1),
                    pretty_code_flags(flags2),
                )
        else:
            # all other members must be equal
            if getattr(code_obj1, member) != getattr(code_obj2, member):
                raise CmpErrorMember(
                    name, member, getattr(code_obj1, member), getattr(code_obj2, member)
                )
class Token(ScannerToken):
    """Token class with changed semantics for 'cmp()'.

    Certain pairs of token kinds are treated as equal, and the operand of a
    jump token is ignored once JUMP_OPS has been populated.

    ``__cmp__`` follows the cmp() protocol: 0 means "equal", any non-zero
    value means "different". Only Python 2 calls ``__cmp__``; Python 3
    ignores it.
    """

    @staticmethod
    def _three_way(a, b):
        """Return 0 if ``a == b``, else a non-zero cmp()-style result."""
        if a == b:
            return 0
        try:
            return -1 if a < b else 1
        except TypeError:
            # operands that cannot be ordered still compare as "different"
            return 1

    def __cmp__(self, o):
        t = self.kind  # shortcut
        if t == "BUILD_TUPLE_0" and o.kind == "LOAD_CONST" and o.pattr == ():
            return 0
        if t == "COME_FROM" == o.kind:
            return 0
        if t == "PRINT_ITEM_CONT" and o.kind == "PRINT_ITEM":
            return 0
        if t == "RETURN_VALUE" and o.kind == "RETURN_END_IF":
            return 0
        if t == "JUMP_IF_FALSE_OR_POP" and o.kind == "POP_JUMP_IF_FALSE":
            return 0
        # The previous code returned ``t == o.kind`` / ``... or pattr == ...``,
        # i.e. a truthy value ("different") for matching tokens and 0
        # ("equal") for mismatching ones -- the reverse of what cmp() means.
        if JUMP_OPS and t in JUMP_OPS:
            # ignore offset
            return self._three_way(t, o.kind)
        return self._three_way(t, o.kind) or self._three_way(self.pattr, o.pattr)

    def __repr__(self):
        return "%s %s (%s)" % (str(self.kind), str(self.attr), repr(self.pattr))

    def __str__(self):
        return "%s\t%-17s %r" % (self.offset, self.kind, self.pattr)
def compare_code_with_srcfile(pyc_filename, src_filename, verify):
    """Compare a .pyc with a source code file. If everything is okay, None
    is returned. Otherwise a string message describing the mismatch is returned.

    A message string is returned when the .pyc magic number differs from
    PYTHON_MAGIC_INT, when loading the source raises SyntaxError, or, for
    verify == "verify-run", when running the source with the current
    interpreter fails. Differences found by cmp_code_objects() are not
    returned as strings: that function raises, and the exception propagates
    to the caller.
    """
    (
        version,
        timestamp,
        magic_int,
        code_obj1,
        is_pypy,
        source_size,
        sip_hash,
    ) = load_module(pyc_filename)
    if magic_int != PYTHON_MAGIC_INT:
        msg = (
            "Can't compare code - Python is running with magic %s, but code is magic %s "
            % (PYTHON_MAGIC_INT, magic_int)
        )
        return msg
    try:
        code_obj2 = load_file(src_filename)
    except SyntaxError as e:
        # src_filename can be the first of a group sometimes
        return str(e).replace(src_filename, pyc_filename)
    cmp_code_objects(version, is_pypy, code_obj1, code_obj2, verify)
    if verify == "verify-run":
        try:
            # Pass an argument list rather than a shell string, so that
            # paths containing spaces or shell metacharacters are run
            # verbatim and never interpreted by a shell.
            retcode = call([sys.executable, src_filename])
        except OSError as e:
            return "Execution failed: %s" % e
        if retcode < 0:
            # subprocess reports death by signal N as return code -N
            return "Child was terminated by signal %d" % -retcode
        if retcode > 0:
            return "Child exited with status %d" % retcode
    return None
def compare_files(pyc_filename1, pyc_filename2, verify):
    """Compare two .pyc files.

    Both files are read with load_module() and their code objects are handed
    to cmp_code_objects(), using the version of the first file and the
    is_pypy flag of the second. When the magic numbers differ and *verify*
    is "verify", the mode is changed to "weak_verify" before comparing.

    Returns None; any difference is reported through the exceptions raised
    by cmp_code_objects().
    """
    # Use the load_module imported from xdis at the top of this file, as
    # compare_code_with_srcfile() does, rather than going through an
    # attribute of the uncompyle6 package.
    (
        version1,
        timestamp,
        magic_int1,
        code_obj1,
        is_pypy,
        source_size,
        sip_hash,
    ) = load_module(pyc_filename1)
    (
        version2,
        timestamp,
        magic_int2,
        code_obj2,
        is_pypy,
        source_size,
        sip_hash,
    ) = load_module(pyc_filename2)
    if (magic_int1 != magic_int2) and verify == "verify":
        verify = "weak_verify"
    cmp_code_objects(version1, is_pypy, code_obj1, code_obj2, verify)
if __name__ == "__main__":
    # Small demo: build two tokens, show their repr, and print whether
    # their kinds and attrs are equal.
    demo_tokens = [
        Token("LOAD_CONST", None, "code_object _expandLang", 52),
        Token("LOAD_CONST", -421, "code_object _expandLang", 55),
    ]
    for demo_token in demo_tokens:
        print(repr(demo_token))
    first, second = demo_tokens
    print(first.kind == second.kind, first.attr == second.attr)