BETA10: reccomp support and Ghidra imports (#1091)

* Implement core functionality (before linter) * run linter + formatter * Review: remove redundant code * Implement end of range check for vtables --------- Co-authored-by: jonschz <jonschz@users.noreply.github.com>
2024-11-26 23:10:35 +00:00 · 2024-08-29 20:54:23 +02:00 · 2024-08-29 20:54:23 +02:00 · b898d98515
commit b898d98515
parent ce3fc57025
11 changed files with 254 additions and 97 deletions
--- a/.gitignore
+++ b/.gitignore
@ -12,11 +12,16 @@ ENV/
 VENV/
 env.bak/
 venv.bak/
 ISLE.EXE
 LEGO1.DLL
 /build/
 /build_debug/
 /legobin/
 *.swp
 LEGO1PROGRESS.*
 ISLEPROGRESS.*
 *.pyc
 tools/ghidra_scripts/import.log
 # By convention we put the retail binaries into ./legobin.
 # These entries are kept for now since that convention has not always been around.
 ISLE.EXE
 LEGO1.DLL
--- a/LEGO1/lego/legoomni/src/race/legoracespecial.cpp
+++ b/LEGO1/lego/legoomni/src/race/legoracespecial.cpp
@ -209,7 +209,7 @@ void LegoCarRaceActor::SwitchBoundary(LegoPathBoundary*& p_boundary, LegoUnknown
 }
 // FUNCTION: LEGO1 0x10080b70
-// FUNCTION: BETA10 0x1000366b
+// FUNCTION: BETA10 0x100cdbae
 void LegoCarRaceActor::VTable0x70(float p_float)
 {
 	// m_unk0x0c is not an MxBool, there are places where it is set to 2 or higher
--- a/LEGO1/realtime/vector.h
+++ b/LEGO1/realtime/vector.h
@ -89,7 +89,7 @@ public:
 	virtual float Dot(float* p_a, float* p_b) const { return DotImpl(p_a, p_b); } // vtable+0x3c
 	// FUNCTION: LEGO1 0x100020f0
-	// FUNCTION: BETA10 0x100028f6
+	// FUNCTION: BETA10 0x100108c0
 	virtual float Dot(Vector2* p_a, Vector2* p_b) const { return DotImpl(p_a->m_data, p_b->m_data); } // vtable+0x38
 	// FUNCTION: LEGO1 0x10002110
@ -188,7 +188,7 @@ public:
 	// in reverse order of appearance.
 	// FUNCTION: LEGO1 0x10002270
-	// FUNCTION: BETA10 0x100064a1
+	// FUNCTION: BETA10 0x10011350
 	virtual void EqualsCrossImpl(float* p_a, float* p_b)
 	{
 		m_data[0] = p_a[1] * p_b[2] - p_a[2] * p_b[1];
@ -275,7 +275,7 @@ public:
 	void EqualsImpl(float* p_data) override { memcpy(m_data, p_data, sizeof(float) * 3); } // vtable+0x20
 	// FUNCTION: LEGO1 0x10003bc0
-	// FUNCTION: BETA10 0x1000132a
+	// FUNCTION: BETA10 0x100114f0
 	void Clear() override { memset(m_data, 0, sizeof(float) * 3); } // vtable+0x2c
 	// FUNCTION: LEGO1 0x10003bd0
--- a/tools/ghidra_scripts/import_functions_and_types_from_pdb.py
+++ b/tools/ghidra_scripts/import_functions_and_types_from_pdb.py
@ -21,6 +21,7 @@
 # Disable spurious warnings in vscode / pylance
 # pyright: reportMissingModuleSource=false
 from enum import Enum
 import importlib
 from dataclasses import dataclass, field
 import logging.handlers
@ -64,6 +65,25 @@ class Globals:
    statistics: Statistics = field(default_factory=Statistics)
 class SupportedModules(Enum):
    LEGO1 = 1
    BETA10 = 2
    def orig_filename(self):
        if self == self.LEGO1:
            return "LEGO1.DLL"
        return "BETA10.DLL"
    def recomp_filename_without_extension(self):
        # in case we want to support more functions
        return "LEGO1"
    def build_dir_name(self):
        if self == self.BETA10:
            return "build_debug"
        return "build"
 # hard-coded settings that we don't want to prompt in Ghidra every time
 GLOBALS = Globals(
    verbose=False,
@ -133,7 +153,7 @@ def import_function_into_ghidra(
    # Find the Ghidra function at that address
    ghidra_address = getAddressFactory().getAddress(hex_original_address)
    # pylint: disable=possibly-used-before-assignment
-    function_importer = PdbFunctionImporter(api, pdb_function, type_importer)
+    function_importer = PdbFunctionImporter.build(api, pdb_function, type_importer)
    ghidra_function = getFunctionAt(ghidra_address)
    if ghidra_function is None:
@ -208,11 +228,29 @@ def log_and_track_failure(
 def main():
    if GLOBALS.running_from_ghidra:
        origfile_name = getProgramFile().getName()
        if origfile_name == "LEGO1.DLL":
            module = SupportedModules.LEGO1
        elif origfile_name in ["LEGO1D.DLL", "BETA10.DLL"]:
            module = SupportedModules.BETA10
        else:
            raise Lego1Exception(
                f"Unsupported file name in import script: {origfile_name}"
            )
    else:
        module = SupportedModules.LEGO1
    logger.info("Importing file: %s", module.orig_filename())
    repo_root = get_repository_root()
-    origfile_path = repo_root.joinpath("LEGO1.DLL")
+    origfile_path = repo_root.joinpath("legobin").joinpath(module.orig_filename())
-    build_path = repo_root.joinpath("build")
+    build_directory = repo_root.joinpath(module.build_dir_name())
-    recompiledfile_path = build_path.joinpath("LEGO1.DLL")
+    recompiledfile_name = f"{module.recomp_filename_without_extension()}.DLL"
-    pdb_path = build_path.joinpath("LEGO1.pdb")
+    recompiledfile_path = build_directory.joinpath(recompiledfile_name)
    pdbfile_name = f"{module.recomp_filename_without_extension()}.PDB"
    pdbfile_path = build_directory.joinpath(pdbfile_name)
    if not GLOBALS.verbose:
        logging.getLogger("isledecomp.bin").setLevel(logging.WARNING)
@ -225,7 +263,9 @@ def main():
    with Bin(str(origfile_path), find_str=True) as origfile, Bin(
        str(recompiledfile_path)
    ) as recompfile:
-        isle_compare = IsleCompare(origfile, recompfile, str(pdb_path), str(repo_root))
+        isle_compare = IsleCompare(
            origfile, recompfile, str(pdbfile_path), str(repo_root)
        )
    logger.info("Comparison complete.")
--- a/tools/ghidra_scripts/lego_util/function_importer.py
+++ b/tools/ghidra_scripts/lego_util/function_importer.py
@ -5,6 +5,7 @@
 import logging
 from typing import Optional
 from abc import ABC, abstractmethod
 from ghidra.program.model.listing import Function, Parameter
 from ghidra.program.flatapi import FlatProgramAPI
@ -24,6 +25,7 @@ from lego_util.pdb_extraction import (
 )
 from lego_util.ghidra_helper import (
    add_data_type_or_reuse_existing,
    create_ghidra_namespace,
    get_or_add_pointer_type,
    get_ghidra_namespace,
    sanitize_name,
@ -32,12 +34,10 @@ from lego_util.ghidra_helper import (
 from lego_util.exceptions import StackOffsetMismatchError, Lego1Exception
 from lego_util.type_importer import PdbTypeImporter
 logger = logging.getLogger(__name__)
-# pylint: disable=too-many-instance-attributes
+class PdbFunctionImporter(ABC):
 class PdbFunctionImporter:
    """A representation of a function from the PDB with each type replaced by a Ghidra type instance."""
    def __init__(
@ -48,20 +48,79 @@ class PdbFunctionImporter:
    ):
        self.api = api
        self.match_info = func.match_info
        self.signature = func.signature
        self.is_stub = func.is_stub
        self.type_importer = type_importer
        if self.signature.class_type is not None:
            # Import the base class so the namespace exists
            self.type_importer.import_pdb_type_into_ghidra(self.signature.class_type)
        assert self.match_info.name is not None
        colon_split = sanitize_name(self.match_info.name).split("::")
        self.name = colon_split.pop()
        namespace_hierachy = colon_split
-        self.namespace = get_ghidra_namespace(api, namespace_hierachy)
+        self.namespace = self._do_get_namespace(namespace_hierachy)
    def _do_get_namespace(self, namespace_hierarchy: list[str]):
        return get_ghidra_namespace(self.api, namespace_hierarchy)
    def get_full_name(self) -> str:
        return f"{self.namespace.getName()}::{self.name}"
    @staticmethod
    def build(api: FlatProgramAPI, func: PdbFunction, type_importer: "PdbTypeImporter"):
        return (
            ThunkPdbFunctionImport(api, func, type_importer)
            if func.signature is None
            else FullPdbFunctionImporter(api, func, type_importer)
        )
    @abstractmethod
    def matches_ghidra_function(self, ghidra_function: Function) -> bool:
        ...
    @abstractmethod
    def overwrite_ghidra_function(self, ghidra_function: Function):
        ...
 class ThunkPdbFunctionImport(PdbFunctionImporter):
    """For importing thunk functions (like vtordisp or debug build thunks) into Ghidra.
    Only the name of the function will be imported."""
    def _do_get_namespace(self, namespace_hierarchy: list[str]):
        """We need to create the namespace because we don't import the return type here"""
        return create_ghidra_namespace(self.api, namespace_hierarchy)
    def matches_ghidra_function(self, ghidra_function: Function) -> bool:
        name_match = self.name == ghidra_function.getName(False)
        namespace_match = self.namespace == ghidra_function.getParentNamespace()
        logger.debug("Matches: namespace=%s name=%s", namespace_match, name_match)
        return name_match and namespace_match
    def overwrite_ghidra_function(self, ghidra_function: Function):
        ghidra_function.setName(self.name, SourceType.USER_DEFINED)
        ghidra_function.setParentNamespace(self.namespace)
 # pylint: disable=too-many-instance-attributes
 class FullPdbFunctionImporter(PdbFunctionImporter):
    """For importing functions into Ghidra where all information are available."""
    def __init__(
        self,
        api: FlatProgramAPI,
        func: PdbFunction,
        type_importer: "PdbTypeImporter",
    ):
        super().__init__(api, func, type_importer)
        assert func.signature is not None
        self.signature = func.signature
        self.is_stub = func.is_stub
        if self.signature.class_type is not None:
            # Import the base class so the namespace exists
            self.type_importer.import_pdb_type_into_ghidra(self.signature.class_type)
        self.return_type = type_importer.import_pdb_type_into_ghidra(
            self.signature.return_type
@ -75,17 +134,6 @@ class PdbFunctionImporter:
            for (index, type_name) in enumerate(self.signature.arglist)
        ]
    @property
    def call_type(self):
        return self.signature.call_type
    @property
    def stack_symbols(self):
        return self.signature.stack_symbols
    def get_full_name(self) -> str:
        return f"{self.namespace.getName()}::{self.name}"
    def matches_ghidra_function(self, ghidra_function: Function) -> bool:
        """Checks whether this function declaration already matches the description in Ghidra"""
        name_match = self.name == ghidra_function.getName(False)
@ -235,7 +283,7 @@ class PdbFunctionImporter:
        ghidra_function.setName(self.name, SourceType.USER_DEFINED)
        ghidra_function.setParentNamespace(self.namespace)
        ghidra_function.setReturnType(self.return_type, SourceType.USER_DEFINED)
-        ghidra_function.setCallingConvention(self.call_type)
+        ghidra_function.setCallingConvention(self.signature.call_type)
        if self.is_stub:
            logger.debug(
@ -306,7 +354,7 @@ class PdbFunctionImporter:
        return next(
            (
                symbol
-                for symbol in self.stack_symbols
+                for symbol in self.signature.stack_symbols
                if isinstance(symbol, CppStackSymbol)
                and symbol.stack_offset == stack_offset
            ),
@ -319,7 +367,7 @@ class PdbFunctionImporter:
        return next(
            (
                symbol
-                for symbol in self.stack_symbols
+                for symbol in self.signature.stack_symbols
                if isinstance(symbol, CppRegisterSymbol) and symbol.register == register
            ),
            None,
--- a/tools/ghidra_scripts/lego_util/ghidra_helper.py
+++ b/tools/ghidra_scripts/lego_util/ghidra_helper.py
@ -1,6 +1,7 @@
 """A collection of helper functions for the interaction with Ghidra."""
 import logging
 import re
 from lego_util.exceptions import (
    ClassOrNamespaceNotFoundInGhidraError,
@ -80,25 +81,42 @@ def create_ghidra_namespace(
    return namespace
 # These appear in debug builds
 THUNK_OF_RE = re.compile(r"^Thunk of '(.*)'$")
 def sanitize_name(name: str) -> str:
    """
    Takes a full class or function name and replaces characters not accepted by Ghidra.
-    Applies mostly to templates and names like `vbase destructor`.
+    Applies mostly to templates, names like `vbase destructor`, and thunks in debug build.
    """
-    new_class_name = (
+    if (match := THUNK_OF_RE.fullmatch(name)) is not None:
        is_thunk = True
        name = match.group(1)
    else:
        is_thunk = False
    # Replace characters forbidden in Ghidra
    new_name = (
        name.replace("<", "[")
        .replace(">", "]")
        .replace("*", "#")
        .replace(" ", "_")
        .replace("`", "'")
    )
    if "<" in name:
        new_class_name = "_template_" + new_class_name
-    if new_class_name != name:
+    if "<" in name:
-        logger.warning(
+        new_name = "_template_" + new_name
-            "Class or function name contains characters forbidden by Ghidra, changing from '%s' to '%s'",
+
    if is_thunk:
        split = new_name.split("::")
        split[-1] = "_thunk_" + split[-1]
        new_name = "::".join(split)
    if new_name != name:
        logger.info(
            "Changed class or function name from '%s' to '%s' to avoid Ghidra issues",
            name,
-            new_class_name,
+            new_name,
        )
-    return new_class_name
+    return new_name
--- a/tools/ghidra_scripts/lego_util/headers.pyi
+++ b/tools/ghidra_scripts/lego_util/headers.pyi
@ -1,4 +1,4 @@
-from typing import TypeVar
+from typing import TypeVar, Any
 import ghidra
 # pylint: disable=invalid-name,unused-argument
@ -17,3 +17,4 @@ def getFunctionAt(
 def createFunction(
    entryPoint: ghidra.program.model.address.Address, name: str
 ) -> ghidra.program.model.listing.Function: ...
 def getProgramFile() -> Any: ...  # actually java.io.File
--- a/tools/ghidra_scripts/lego_util/pdb_extraction.py
+++ b/tools/ghidra_scripts/lego_util/pdb_extraction.py
@ -3,6 +3,7 @@ import re
 from typing import Any, Optional
 import logging
 from isledecomp.bin import InvalidVirtualAddressError
 from isledecomp.cvdump.symbols import SymbolsEntry
 from isledecomp.compare import Compare as IsleCompare
 from isledecomp.compare.db import MatchInfo
@ -43,7 +44,7 @@ class FunctionSignature:
@dataclass
 class PdbFunction:
    match_info: MatchInfo
-    signature: FunctionSignature
+    signature: Optional[FunctionSignature]
    is_stub: bool
@ -74,9 +75,7 @@ class PdbFunctionExtractor:
    def get_func_signature(self, fn: SymbolsEntry) -> Optional[FunctionSignature]:
        function_type_str = fn.func_type
        if function_type_str == "T_NOTYPE(0000)":
-            logger.debug(
+            logger.debug("Treating NOTYPE function as thunk: %s", fn.name)
                "Skipping a NOTYPE (synthetic or template + synthetic): %s", fn.name
            )
            return None
        # get corresponding function type
@ -145,8 +144,6 @@ class PdbFunctionExtractor:
        assert match_info.orig_addr is not None
        match_options = self.compare.get_match_options(match_info.orig_addr)
        assert match_options is not None
        if match_options.get("skip", False):
            return None
        function_data = next(
            (
@ -156,11 +153,19 @@ class PdbFunctionExtractor:
            ),
            None,
        )
-        if not function_data:
+        if function_data is None:
-            logger.error(
+            try:
-                "Did not find function in nodes, skipping: %s", match_info.name
+                # this can be either a thunk (which we want) or an external function
-            )
+                # (which we don't want), so we tell them apart based on the validity of their address.
-            return None
+                self.compare.orig_bin.get_relative_addr(match_info.orig_addr)
                return PdbFunction(match_info, None, False)
            except InvalidVirtualAddressError:
                logger.debug(
                    "Skipping external function %s (address 0x%x not in original binary)",
                    match_info.name,
                    match_info.orig_addr,
                )
                return None
        function_symbol = function_data.symbol_entry
        if function_symbol is None:
@ -171,8 +176,6 @@ class PdbFunctionExtractor:
            return None
        function_signature = self.get_func_signature(function_symbol)
        if function_signature is None:
            return None
        is_stub = match_options.get("stub", False)
--- a/tools/isledecomp/isledecomp/compare/core.py
+++ b/tools/isledecomp/isledecomp/compare/core.py
@ -479,14 +479,31 @@ class Compare:
        construct the name of the vtordisp function and match based on that."""
        for match in self._db.get_matches_by_type(SymbolType.VTABLE):
            assert (
                match.name is not None
                and match.orig_addr is not None
                and match.recomp_addr is not None
                and match.size is not None
            )
            # We need some method of identifying vtables that
            # might have thunks, and this ought to work okay.
            if "{for" not in match.name:
                continue
            next_orig = self._db.get_next_orig_addr(match.orig_addr)
            assert next_orig is not None
            orig_upper_size_limit = next_orig - match.orig_addr
            if orig_upper_size_limit < match.size:
                # This could happen in debug builds due to code changes between BETA10 and LEGO1,
                # but we have not seen it yet as of 2024-08-28.
                logger.warning(
                    "Recomp vtable is larger than orig vtable for %s",
                    match.name,
                )
            # TODO: We might want to fix this at the source (cvdump) instead.
            # Any problem will be logged later when we compare the vtable.
-            vtable_size = 4 * (match.size // 4)
+            vtable_size = 4 * (min(match.size, orig_upper_size_limit) // 4)
            orig_table = self.orig_bin.read(match.orig_addr, vtable_size)
            recomp_table = self.recomp_bin.read(match.recomp_addr, vtable_size)
@ -497,51 +514,65 @@ class Compare:
            # Now walk both vtables looking for thunks.
            for orig_addr, recomp_addr in raw_addrs:
-                if not self._db.is_vtordisp(recomp_addr):
+                if orig_addr == 0:
-                    continue
+                    # This happens in debug builds due to code changes between BETA10 and LEGO1.
                    # Note that there is a risk of running into the next vtable if there is no gap in between,
                    # which we cannot protect against at the moment.
                    logger.warning(
                        "Recomp vtable is larger than orig vtable for %s", match.name
                    )
                    break
-                thunk_fn = self.get_by_recomp(recomp_addr)
+                if self._db.is_vtordisp(recomp_addr):
                    self._match_vtordisp_in_vtable(orig_addr, recomp_addr)
-                # Read the function bytes here.
+    def _match_vtordisp_in_vtable(self, orig_addr, recomp_addr):
-                # In practice, the adjuster thunk will be under 16 bytes.
+        thunk_fn = self.get_by_recomp(recomp_addr)
-                # If we have thunks of unequal size, we can still tell whether
+        assert thunk_fn is not None
-                # they are thunking the same function by grabbing the
+        assert thunk_fn.size is not None
                # JMP instruction at the end.
                thunk_presumed_size = max(thunk_fn.size, 16)
-                # Strip off MSVC padding 0xcc bytes.
+        # Read the function bytes here.
-                # This should be safe to do; it is highly unlikely that
+        # In practice, the adjuster thunk will be under 16 bytes.
-                # the MSB of the jump displacement would be 0xcc. (huge jump)
+        # If we have thunks of unequal size, we can still tell whether they are thunking
-                orig_thunk_bin = self.orig_bin.read(
+        # the same function by grabbing the JMP instruction at the end.
-                    orig_addr, thunk_presumed_size
+        thunk_presumed_size = max(thunk_fn.size, 16)
                ).rstrip(b"\xcc")
-                recomp_thunk_bin = self.recomp_bin.read(
+        # Strip off MSVC padding 0xcc bytes.
-                    recomp_addr, thunk_presumed_size
+        # This should be safe to do; it is highly unlikely that
-                ).rstrip(b"\xcc")
+        # the MSB of the jump displacement would be 0xcc. (huge jump)
        orig_thunk_bin = self.orig_bin.read(orig_addr, thunk_presumed_size).rstrip(
            b"\xcc"
        )
-                # Read jump opcode and displacement (last 5 bytes)
+        recomp_thunk_bin = self.recomp_bin.read(
-                (orig_jmp, orig_disp) = struct.unpack("<Bi", orig_thunk_bin[-5:])
+            recomp_addr, thunk_presumed_size
-                (recomp_jmp, recomp_disp) = struct.unpack("<Bi", recomp_thunk_bin[-5:])
+        ).rstrip(b"\xcc")
-                # Make sure it's a JMP
+        # Read jump opcode and displacement (last 5 bytes)
-                if orig_jmp != 0xE9 or recomp_jmp != 0xE9:
+        (orig_jmp, orig_disp) = struct.unpack("<Bi", orig_thunk_bin[-5:])
-                    continue
+        (recomp_jmp, recomp_disp) = struct.unpack("<Bi", recomp_thunk_bin[-5:])
-                # Calculate jump destination from the end of the JMP instruction
+        # Make sure it's a JMP
-                # i.e. the end of the function
+        if orig_jmp != 0xE9 or recomp_jmp != 0xE9:
-                orig_actual = orig_addr + len(orig_thunk_bin) + orig_disp
+            logger.warning(
-                recomp_actual = recomp_addr + len(recomp_thunk_bin) + recomp_disp
+                "Not a jump in vtordisp at (0x%x, 0x%x)", orig_addr, recomp_addr
            )
            return
-                # If they are thunking the same function, then this must be a match.
+        # Calculate jump destination from the end of the JMP instruction
-                if self.is_pointer_match(orig_actual, recomp_actual):
+        # i.e. the end of the function
-                    if len(orig_thunk_bin) != len(recomp_thunk_bin):
+        orig_actual = orig_addr + len(orig_thunk_bin) + orig_disp
-                        logger.warning(
+        recomp_actual = recomp_addr + len(recomp_thunk_bin) + recomp_disp
-                            "Adjuster thunk %s (0x%x) is not exact",
+
-                            thunk_fn.name,
+        # If they are thunking the same function, then this must be a match.
-                            orig_addr,
+        if self.is_pointer_match(orig_actual, recomp_actual):
-                        )
+            if len(orig_thunk_bin) != len(recomp_thunk_bin):
-                    self._db.set_function_pair(orig_addr, recomp_addr)
+                logger.warning(
                    "Adjuster thunk %s (0x%x) is not exact",
                    thunk_fn.name,
                    orig_addr,
                )
            self._db.set_function_pair(orig_addr, recomp_addr)
    def _dump_asm(self, orig_combined, recomp_combined):
        """Append the provided assembly output to the debug files"""
--- a/tools/isledecomp/isledecomp/compare/db.py
+++ b/tools/isledecomp/isledecomp/compare/db.py
@ -1,5 +1,6 @@
 """Wrapper for database (here an in-memory sqlite database) that collects the
 addresses/symbols that we want to compare between the original and recompiled binaries."""
 import sqlite3
 import logging
 from typing import Any, List, Optional
@ -363,6 +364,10 @@ class CompareDb:
        if "`vtordisp" in name:
            return True
        if decorated_name is None:
            # happens in debug builds, e.g. for "Thunk of 'LegoAnimActor::ClassName'"
            return False
        new_name = get_vtordisp_name(decorated_name)
        if new_name is None:
            return False
--- a/util/decomp.h
+++ b/util/decomp.h
@ -1,6 +1,12 @@
 #ifndef DECOMP_H
 #define DECOMP_H
 #ifndef NDEBUG
 // Disable size assertions for debug builds because the sizes differ between debug and release builds.
 // The release LEGO1.DLL is what we ultimately want to decompile, so this is what we assert against.
 #undef ENABLE_DECOMP_ASSERTS
 #endif
 #if defined(ENABLE_DECOMP_ASSERTS)
 #define DECOMP_STATIC_ASSERT(V)                                                                                        \
 	namespace                                                                                                          \