mirror of
https://github.com/isledecomp/isle.git
synced 2024-11-23 05:19:47 +00:00
Brute force string search for BETA10 (#1097)
* Brute force string search for BETA10 * improved string check * Skip this unless source binary is debug * remove misplaced comment
This commit is contained in:
parent
2af5f87051
commit
30be1ed4b8
@ -465,6 +465,22 @@ class Bin:
|
||||
for (func_addr, name_addr) in combined
|
||||
]
|
||||
|
||||
def iter_string(self, encoding: str = "ascii") -> Iterator[Tuple[int, str]]:
|
||||
"""Search for possible strings at each verified address in .data."""
|
||||
section = self.get_section_by_name(".data")
|
||||
for addr in self._relocated_addrs:
|
||||
if section.contains_vaddr(addr):
|
||||
raw = self.read_string(addr)
|
||||
if raw is None:
|
||||
continue
|
||||
|
||||
try:
|
||||
string = raw.decode(encoding)
|
||||
except UnicodeDecodeError:
|
||||
continue
|
||||
|
||||
yield (addr, string)
|
||||
|
||||
def get_section_by_name(self, name: str) -> Section:
|
||||
section = next(
|
||||
filter(lambda section: section.match_name(name), self.sections),
|
||||
|
@ -82,8 +82,9 @@ class Compare:
|
||||
|
||||
self._load_cvdump()
|
||||
self._load_markers()
|
||||
self._find_original_strings()
|
||||
# Detect floats first to eliminate potential overlap with string data
|
||||
self._find_float_const()
|
||||
self._find_original_strings()
|
||||
self._match_imports()
|
||||
self._match_exports()
|
||||
self._match_thunks()
|
||||
@ -314,7 +315,7 @@ class Compare:
|
||||
"""Go to the original binary and look for the specified string constants
|
||||
to find a match. This is a (relatively) expensive operation so we only
|
||||
look at strings that we have not already matched via a STRING annotation."""
|
||||
|
||||
# Release builds give each de-duped string a symbol so they are easy to find and match.
|
||||
for string in self._db.get_unmatched_strings():
|
||||
addr = self.orig_bin.find_string(string.encode("latin1"))
|
||||
if addr is None:
|
||||
@ -324,6 +325,40 @@ class Compare:
|
||||
|
||||
self._db.match_string(addr, string)
|
||||
|
||||
def is_real_string(s: str) -> bool:
|
||||
"""Heuristic to ignore values that only look like strings.
|
||||
This is mostly about short strings (len <= 4) that could be byte or word values.
|
||||
"""
|
||||
# 0x10 is the MSB of the address space for DLLs (LEGO1), so this is a pointer
|
||||
if len(s) == 0 or "\x10" in s:
|
||||
return False
|
||||
|
||||
# assert(0) is common
|
||||
if len(s) == 1 and s[0] != "0":
|
||||
return False
|
||||
|
||||
# Hack because str.isprintable() will fail on strings with newlines or tabs
|
||||
if len(s) <= 4 and "\\x" in repr(s):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
# Debug builds do not de-dupe the strings, so we need to find them via brute force scan.
|
||||
# We could try to match the string addrs if there is only one in orig and recomp.
|
||||
# When we sanitize the asm, the result is the same regardless.
|
||||
if self.orig_bin.is_debug:
|
||||
for addr, string in self.orig_bin.iter_string("latin1"):
|
||||
if is_real_string(string):
|
||||
self._db.set_orig_symbol(
|
||||
addr, SymbolType.STRING, string, len(string)
|
||||
)
|
||||
|
||||
for addr, string in self.recomp_bin.iter_string("latin1"):
|
||||
if is_real_string(string):
|
||||
self._db.set_recomp_symbol(
|
||||
addr, SymbolType.STRING, string, None, len(string)
|
||||
)
|
||||
|
||||
def _find_float_const(self):
|
||||
"""Add floating point constants in each binary to the database.
|
||||
We are not matching anything right now because these values are not
|
||||
|
Loading…
Reference in New Issue
Block a user