Ghidra Import: Support virtual inheritance (#1071)

* Implement/fix Ghidra imports for multiple and virtual inheritance

  Unfortunately, the handling in Ghidra is still far from perfect. This is a good place to start, though.

* Support offsets in vbase pointers

* Support `this adjust`

* minor stylistic improvement

* Improvements to documentation

---------

Co-authored-by: jonschz <jonschz@users.noreply.github.com>
2024-11-27 07:20:41 +00:00 · 2024-07-30 19:02:15 +02:00 · 2024-07-30 19:02:15 +02:00 · 412200ecbc
commit 412200ecbc
parent 1f251ff817
6 changed files with 549 additions and 72 deletions
--- a/tools/ghidra_scripts/lego_util/function_importer.py
+++ b/tools/ghidra_scripts/lego_util/function_importer.py
@ -10,6 +10,12 @@ from ghidra.program.model.listing import Function, Parameter
 from ghidra.program.flatapi import FlatProgramAPI
 from ghidra.program.model.listing import ParameterImpl
 from ghidra.program.model.symbol import SourceType
+from ghidra.program.model.data import (
+    TypeDef,
+    TypedefDataType,
+    Pointer,
+    ComponentOffsetSettingsDefinition,
+)

 from lego_util.pdb_extraction import (
    PdbFunction,
@ -17,12 +23,13 @@ from lego_util.pdb_extraction import (
    CppStackSymbol,
 )
 from lego_util.ghidra_helper import (
-    add_pointer_type,
+    add_data_type_or_reuse_existing,
+    get_or_add_pointer_type,
    get_ghidra_namespace,
    sanitize_name,
 )

-from lego_util.exceptions import StackOffsetMismatchError
+from lego_util.exceptions import StackOffsetMismatchError, Lego1Exception
 from lego_util.type_importer import PdbTypeImporter


@ -91,7 +98,10 @@ class PdbFunctionImporter:
        if (
            (not return_type_match)
            and (self.return_type.getLength() > 4)
-            and (add_pointer_type(self.api, self.return_type) == ghidra_return_type)
+            and (
+                get_or_add_pointer_type(self.api, self.return_type)
+                == ghidra_return_type
+            )
            and any(
                param
                for param in ghidra_function.getParameters()
@ -103,19 +113,22 @@ class PdbFunctionImporter:
            )
            return_type_match = True

-        # match arguments: decide if thiscall or not
+        # match arguments: decide if thiscall or not, and whether the `this` type matches
        thiscall_matches = (
            self.signature.call_type == ghidra_function.getCallingConventionName()
        )

+        ghidra_params_without_this = list(ghidra_function.getParameters())
+
+        if thiscall_matches and self.signature.call_type == "__thiscall":
+            this_argument = ghidra_params_without_this.pop(0)
+            thiscall_matches = self._this_type_match(this_argument)
+
        if self.is_stub:
            # We do not import the argument list for stubs, so it should be excluded in matches
            args_match = True
        elif thiscall_matches:
-            if self.signature.call_type == "__thiscall":
-                args_match = self._matches_thiscall_parameters(ghidra_function)
-            else:
-                args_match = self._matches_non_thiscall_parameters(ghidra_function)
+            args_match = self._parameter_lists_match(ghidra_params_without_this)
        else:
            args_match = False

@ -136,16 +149,22 @@ class PdbFunctionImporter:
            and args_match
        )

-    def _matches_non_thiscall_parameters(self, ghidra_function: Function) -> bool:
-        return self._parameter_lists_match(ghidra_function.getParameters())
+    def _this_type_match(self, this_parameter: Parameter) -> bool:
+        if this_parameter.getName() != "this":
+            logger.info("Expected first argument to be `this` in __thiscall")
+            return False

-    def _matches_thiscall_parameters(self, ghidra_function: Function) -> bool:
-        ghidra_params = list(ghidra_function.getParameters())
+        if self.signature.this_adjust != 0:
+            # In this case, the `this` argument should be custom defined
+            if not isinstance(this_parameter.getDataType(), TypeDef):
+                logger.info(
+                    "`this` argument is not a typedef while `this adjust` = %d",
+                    self.signature.this_adjust,
+                )
+                return False
+            # We are not checking for the _correct_ `this` type here, which we could do in the future

-        # remove the `this` argument which we don't generate ourselves
-        ghidra_params.pop(0)
-
-        return self._parameter_lists_match(ghidra_params)
+        return True

    def _parameter_lists_match(self, ghidra_params: "list[Parameter]") -> bool:
        # Remove return storage pointer from comparison if present.
@ -194,6 +213,25 @@ class PdbFunctionImporter:

    def overwrite_ghidra_function(self, ghidra_function: Function):
        """Replace the function declaration in Ghidra by the one derived from C++."""
+
+        if ghidra_function.hasCustomVariableStorage():
+            # Unfortunately, calling `ghidra_function.setCustomVariableStorage(False)`
+            # leads to two `this` parameters. Therefore, we first need to remove all `this` parameters
+            # and then re-generate a new one
+            ghidra_function.replaceParameters(
+                Function.FunctionUpdateType.DYNAMIC_STORAGE_ALL_PARAMS,  # this implicitly sets custom variable storage to False
+                True,
+                SourceType.USER_DEFINED,
+                [
+                    param
+                    for param in ghidra_function.getParameters()
+                    if param.getName() != "this"
+                ],
+            )
+
+        if ghidra_function.hasCustomVariableStorage():
+            raise Lego1Exception("Failed to disable custom variable storage.")
+
        ghidra_function.setName(self.name, SourceType.USER_DEFINED)
        ghidra_function.setParentNamespace(self.namespace)
        ghidra_function.setReturnType(self.return_type, SourceType.USER_DEFINED)
@ -203,16 +241,18 @@ class PdbFunctionImporter:
            logger.debug(
                "%s is a stub, skipping parameter import", self.get_full_name()
            )
-            return
+        else:
+            ghidra_function.replaceParameters(
+                Function.FunctionUpdateType.DYNAMIC_STORAGE_ALL_PARAMS,
+                True,  # force
+                SourceType.USER_DEFINED,
+                self.arguments,
+            )
+            self._import_parameter_names(ghidra_function)

-        ghidra_function.replaceParameters(
-            Function.FunctionUpdateType.DYNAMIC_STORAGE_ALL_PARAMS,
-            True,  # force
-            SourceType.USER_DEFINED,
-            self.arguments,
-        )
-
-        self._import_parameter_names(ghidra_function)
+        # Special handling for `this adjust` and virtual inheritance
+        if self.signature.this_adjust != 0:
+            self._set_this_adjust(ghidra_function)

    def _import_parameter_names(self, ghidra_function: Function):
        # When we call `ghidra_function.replaceParameters`, Ghidra will generate the layout.
@ -284,3 +324,50 @@ class PdbFunctionImporter:
            ),
            None,
        )
+
+    def _set_this_adjust(
+        self,
+        ghidra_function: Function,
+    ):
+        """
+        When `this adjust` is non-zero, the pointer type of `this` needs to be replaced by an offset version.
+        The offset can only be set on a typedef on the pointer. We also must enable custom storage so we can modify
+        the auto-generated `this` parameter.
+        """
+
+        # Necessary in order to overwrite the auto-generated `this`
+        ghidra_function.setCustomVariableStorage(True)
+
+        this_parameter = next(
+            (
+                param
+                for param in ghidra_function.getParameters()
+                if param.isRegisterVariable() and param.getName() == "this"
+            ),
+            None,
+        )
+
+        if this_parameter is None:
+            logger.error(
+                "Failed to find `this` parameter in a function with `this adjust = %d`",
+                self.signature.this_adjust,
+            )
+        else:
+            current_ghidra_type = this_parameter.getDataType()
+            assert isinstance(current_ghidra_type, Pointer)
+            class_name = current_ghidra_type.getDataType().getName()
+            typedef_name = f"{class_name}PtrOffset0x{self.signature.this_adjust:x}"
+
+            typedef_ghidra_type = TypedefDataType(
+                current_ghidra_type.getCategoryPath(),
+                typedef_name,
+                current_ghidra_type,
+            )
+            ComponentOffsetSettingsDefinition.DEF.setValue(
+                typedef_ghidra_type.getDefaultSettings(), self.signature.this_adjust
+            )
+            typedef_ghidra_type = add_data_type_or_reuse_existing(
+                self.api, typedef_ghidra_type
+            )
+
+            this_parameter.setDataType(typedef_ghidra_type, SourceType.USER_DEFINED)
--- a/tools/ghidra_scripts/lego_util/ghidra_helper.py
+++ b/tools/ghidra_scripts/lego_util/ghidra_helper.py
@ -11,10 +11,8 @@ from lego_util.exceptions import (
 # Disable spurious warnings in vscode / pylance
 # pyright: reportMissingModuleSource=false

-from ghidra.program.model.data import PointerDataType
-from ghidra.program.model.data import DataTypeConflictHandler
 from ghidra.program.flatapi import FlatProgramAPI
-from ghidra.program.model.data import DataType
+from ghidra.program.model.data import DataType, DataTypeConflictHandler, PointerDataType
 from ghidra.program.model.symbol import Namespace

 logger = logging.getLogger(__name__)
@ -37,9 +35,15 @@ def get_ghidra_type(api: FlatProgramAPI, type_name: str):
    raise MultipleTypesFoundInGhidraError(type_name, result)


-def add_pointer_type(api: FlatProgramAPI, pointee: DataType) -> DataType:
-    new_data_type = PointerDataType(pointee)
-    new_data_type.setCategoryPath(pointee.getCategoryPath())
+def get_or_add_pointer_type(api: FlatProgramAPI, pointee: DataType) -> DataType:
+    new_pointer_data_type = PointerDataType(pointee)
+    new_pointer_data_type.setCategoryPath(pointee.getCategoryPath())
+    return add_data_type_or_reuse_existing(api, new_pointer_data_type)
+
+
+def add_data_type_or_reuse_existing(
+    api: FlatProgramAPI, new_data_type: DataType
+) -> DataType:
    result_data_type = (
        api.getCurrentProgram()
        .getDataTypeManager()
@ -47,7 +51,7 @@ def add_pointer_type(api: FlatProgramAPI, pointee: DataType) -> DataType:
    )
    if result_data_type is not new_data_type:
        logger.debug(
-            "New pointer replaced by existing one. Fresh pointer: %s (class: %s)",
+            "Reusing existing data type instead of new one: %s (class: %s)",
            result_data_type,
            result_data_type.__class__,
        )
--- a/tools/ghidra_scripts/lego_util/pdb_extraction.py
+++ b/tools/ghidra_scripts/lego_util/pdb_extraction.py
@ -36,6 +36,8 @@ class FunctionSignature:
    return_type: str
    class_type: Optional[str]
    stack_symbols: list[CppStackOrRegisterSymbol]
+    # if non-zero: an offset to the `this` parameter in a __thiscall
+    this_adjust: int


@dataclass
@ -119,6 +121,9 @@ class PdbFunctionExtractor:

        call_type = self._call_type_map[function_type["call_type"]]

+        # parse as hex number, default to 0
+        this_adjust = int(function_type.get("this_adjust", "0"), 16)
+
        return FunctionSignature(
            original_function_symbol=fn,
            call_type=call_type,
@ -126,6 +131,7 @@ class PdbFunctionExtractor:
            return_type=function_type["return_type"],
            class_type=class_type,
            stack_symbols=stack_symbols,
+            this_adjust=this_adjust,
        )

    def get_function_list(self) -> list[PdbFunction]:
--- a/tools/ghidra_scripts/lego_util/type_importer.py
+++ b/tools/ghidra_scripts/lego_util/type_importer.py
@ -1,5 +1,5 @@
 import logging
-from typing import Any, Callable, TypeVar
+from typing import Any, Callable, Iterator, Optional, TypeVar

 # Disable spurious warnings in vscode / pylance
 # pyright: reportMissingModuleSource=false
@ -7,6 +7,7 @@ from typing import Any, Callable, TypeVar
 # pylint: disable=too-many-return-statements # a `match` would be better, but for now we are stuck with Python 3.9
 # pylint: disable=no-else-return # Not sure why this rule even is a thing, this is great for checking exhaustiveness

+from isledecomp.cvdump.types import VirtualBasePointer
 from lego_util.exceptions import (
    ClassOrNamespaceNotFoundInGhidraError,
    TypeNotFoundError,
@ -15,7 +16,8 @@ from lego_util.exceptions import (
    StructModificationError,
 )
 from lego_util.ghidra_helper import (
-    add_pointer_type,
+    add_data_type_or_reuse_existing,
+    get_or_add_pointer_type,
    create_ghidra_namespace,
    get_ghidra_namespace,
    get_ghidra_type,
@ -33,6 +35,8 @@ from ghidra.program.model.data import (
    EnumDataType,
    StructureDataType,
    StructureInternal,
+    TypedefDataType,
+    ComponentOffsetSettingsDefinition,
 )
 from ghidra.util.task import ConsoleTaskMonitor

@ -56,10 +60,19 @@ class PdbTypeImporter:
    def types(self):
        return self.extraction.compare.cv.types

-    def import_pdb_type_into_ghidra(self, type_index: str) -> DataType:
+    def import_pdb_type_into_ghidra(
+        self, type_index: str, slim_for_vbase: bool = False
+    ) -> DataType:
        """
        Recursively imports a type from the PDB into Ghidra.
        @param type_index Either a scalar type like `T_INT4(...)` or a PDB reference like `0x10ba`
+        @param slim_for_vbase If true, the current invocation
+            imports a superclass of some class where virtual inheritance is involved (directly or indirectly).
+            This case requires special handling: Let's say we have `class C: B` and `class B: virtual A`. Then cvdump
+            reports a size for B that includes both B's fields as well as the A contained at an offset within B,
+            which is not the correct structure to be contained in C. Therefore, we need to create a "slim" version of B
+            that fits inside C.
+            This value should always be `False` when the referenced type is not (a pointer to) a class.
        """
        type_index_lower = type_index.lower()
        if type_index_lower.startswith("t_"):
@ -76,14 +89,19 @@ class PdbTypeImporter:

        # follow forward reference (class, struct, union)
        if type_pdb.get("is_forward_ref", False):
-            return self._import_forward_ref_type(type_index_lower, type_pdb)
+            return self._import_forward_ref_type(
+                type_index_lower, type_pdb, slim_for_vbase
+            )

        if type_category == "LF_POINTER":
-            return add_pointer_type(
-                self.api, self.import_pdb_type_into_ghidra(type_pdb["element_type"])
+            return get_or_add_pointer_type(
+                self.api,
+                self.import_pdb_type_into_ghidra(
+                    type_pdb["element_type"], slim_for_vbase
+                ),
            )
        elif type_category in ["LF_CLASS", "LF_STRUCTURE"]:
-            return self._import_class_or_struct(type_pdb)
+            return self._import_class_or_struct(type_pdb, slim_for_vbase)
        elif type_category == "LF_ARRAY":
            return self._import_array(type_pdb)
        elif type_category == "LF_ENUM":
@ -120,7 +138,10 @@ class PdbTypeImporter:
        return get_ghidra_type(self.api, scalar_cpp_type)

    def _import_forward_ref_type(
-        self, type_index, type_pdb: dict[str, Any]
+        self,
+        type_index,
+        type_pdb: dict[str, Any],
+        slim_for_vbase: bool = False,
    ) -> DataType:
        referenced_type = type_pdb.get("udt") or type_pdb.get("modifies")
        if referenced_type is None:
@ -136,7 +157,7 @@ class PdbTypeImporter:
            type_index,
            referenced_type,
        )
-        return self.import_pdb_type_into_ghidra(referenced_type)
+        return self.import_pdb_type_into_ghidra(referenced_type, slim_for_vbase)

    def _import_array(self, type_pdb: dict[str, Any]) -> DataType:
        inner_type = self.import_pdb_type_into_ghidra(type_pdb["array_type"])
@ -182,12 +203,18 @@ class PdbTypeImporter:

        return result

-    def _import_class_or_struct(self, type_in_pdb: dict[str, Any]) -> DataType:
+    def _import_class_or_struct(
+        self,
+        type_in_pdb: dict[str, Any],
+        slim_for_vbase: bool = False,
+    ) -> DataType:
        field_list_type: str = type_in_pdb["field_list_type"]
        field_list = self.types.keys[field_list_type.lower()]

        class_size: int = type_in_pdb["size"]
        class_name_with_namespace: str = sanitize_name(type_in_pdb["name"])
+        if slim_for_vbase:
+            class_name_with_namespace += "_vbase_slim"

        if class_name_with_namespace in self.handled_structs:
            logger.debug(
@ -205,11 +232,11 @@ class PdbTypeImporter:

        self._get_or_create_namespace(class_name_with_namespace)

-        data_type = self._get_or_create_struct_data_type(
+        new_ghidra_struct = self._get_or_create_struct_data_type(
            class_name_with_namespace, class_size
        )

-        if (old_size := data_type.getLength()) != class_size:
+        if (old_size := new_ghidra_struct.getLength()) != class_size:
            logger.warning(
                "Existing class %s had incorrect size %d. Setting to %d...",
                class_name_with_namespace,
@ -220,39 +247,189 @@ class PdbTypeImporter:
        logger.info("Adding class data type %s", class_name_with_namespace)
        logger.debug("Class information: %s", type_in_pdb)

-        data_type.deleteAll()
-        data_type.growStructure(class_size)
+        components: list[dict[str, Any]] = []
+        components.extend(self._get_components_from_base_classes(field_list))
+        # can be missing when no new fields are declared
+        components.extend(self._get_components_from_members(field_list))
+        components.extend(
+            self._get_components_from_vbase(
+                field_list, class_name_with_namespace, new_ghidra_struct
+            )
+        )
+
+        components.sort(key=lambda c: c["offset"])
+
+        if slim_for_vbase:
+            # Make a "slim" version: shrink the size to the fields that are actually present.
+            # This makes a difference when the current class uses virtual inheritance
+            assert (
+                len(components) > 0
+            ), f"Error: {class_name_with_namespace} should not be empty. There must be at least one direct or indirect vbase pointer."
+            last_component = components[-1]
+            class_size = last_component["offset"] + last_component["type"].getLength()
+
+        self._overwrite_struct(
+            class_name_with_namespace,
+            new_ghidra_struct,
+            class_size,
+            components,
+        )
+
+        logger.info("Finished importing class %s", class_name_with_namespace)
+
+        return new_ghidra_struct
+
+    def _get_components_from_base_classes(self, field_list) -> Iterator[dict[str, Any]]:
+        non_virtual_base_classes: dict[str, int] = field_list.get("super", {})
+
+        for super_type, offset in non_virtual_base_classes.items():
+            # If we have virtual inheritance _and_ a non-virtual base class here, we play safe and import slim version.
+            # This is technically not needed if only one of the superclasses uses virtual inheritance, but I am not aware of any instance.
+            import_slim_vbase_version_of_superclass = "vbase" in field_list
+            ghidra_type = self.import_pdb_type_into_ghidra(
+                super_type, slim_for_vbase=import_slim_vbase_version_of_superclass
+            )
+
+            yield {
+                "type": ghidra_type,
+                "offset": offset,
+                "name": "base" if offset == 0 else f"base_{ghidra_type.getName()}",
+            }
+
+    def _get_components_from_members(self, field_list: dict[str, Any]):
+        members: list[dict[str, Any]] = field_list.get("members") or []
+        for member in members:
+            yield member | {"type": self.import_pdb_type_into_ghidra(member["type"])}
+
+    def _get_components_from_vbase(
+        self,
+        field_list: dict[str, Any],
+        class_name_with_namespace: str,
+        current_type: StructureInternal,
+    ) -> Iterator[dict[str, Any]]:
+        vbasepointer: Optional[VirtualBasePointer] = field_list.get("vbase", None)
+
+        if vbasepointer is not None and any(x.direct for x in vbasepointer.bases):
+            vbaseptr_type = get_or_add_pointer_type(
+                self.api,
+                self._import_vbaseptr(
+                    current_type, class_name_with_namespace, vbasepointer
+                ),
+            )
+            yield {
+                "type": vbaseptr_type,
+                "offset": vbasepointer.vboffset,
+                "name": "vbase_offset",
+            }
+
+    def _import_vbaseptr(
+        self,
+        current_type: StructureInternal,
+        class_name_with_namespace: str,
+        vbasepointer: VirtualBasePointer,
+    ) -> StructureInternal:
+        pointer_size = 4  # hard-code to 4 because of 32 bit
+
+        components = [
+            {
+                "offset": 0,
+                "type": get_or_add_pointer_type(self.api, current_type),
+                "name": "o_self",
+            }
+        ]
+        for vbase in vbasepointer.bases:
+            vbase_ghidra_type = self.import_pdb_type_into_ghidra(vbase.type)
+
+            type_name = vbase_ghidra_type.getName()
+
+            vbase_ghidra_pointer = get_or_add_pointer_type(self.api, vbase_ghidra_type)
+            vbase_ghidra_pointer_typedef = TypedefDataType(
+                vbase_ghidra_pointer.getCategoryPath(),
+                f"{type_name}PtrOffset",
+                vbase_ghidra_pointer,
+            )
+            # Set a default value of -4 for the pointer offset. While this appears to be correct in many cases,
+            # it does not always lead to the best decompile. It can be fine-tuned by hand; the next function call
+            # makes sure that we don't overwrite this value on re-running the import.
+            ComponentOffsetSettingsDefinition.DEF.setValue(
+                vbase_ghidra_pointer_typedef.getDefaultSettings(), -4
+            )
+
+            vbase_ghidra_pointer_typedef = add_data_type_or_reuse_existing(
+                self.api, vbase_ghidra_pointer_typedef
+            )
+
+            components.append(
+                {
+                    "offset": vbase.index * pointer_size,
+                    "type": vbase_ghidra_pointer_typedef,
+                    "name": f"o_{type_name}",
+                }
+            )
+
+        size = len(components) * pointer_size
+
+        new_ghidra_struct = self._get_or_create_struct_data_type(
+            f"{class_name_with_namespace}::VBasePtr", size
+        )
+
+        self._overwrite_struct(
+            f"{class_name_with_namespace}::VBasePtr",
+            new_ghidra_struct,
+            size,
+            components,
+        )
+
+        return new_ghidra_struct
+
+    def _overwrite_struct(
+        self,
+        class_name_with_namespace: str,
+        new_ghidra_struct: StructureInternal,
+        class_size: int,
+        components: list[dict[str, Any]],
+    ):
+        new_ghidra_struct.deleteAll()
+        new_ghidra_struct.growStructure(class_size)

        # this case happened e.g. for IUnknown, which linked to an (incorrect) existing library, and some other types as well.
        # Unfortunately, we don't get proper error handling for read-only types.
        # However, we really do NOT want to do this every time because the type might be self-referential and partially imported.
-        if data_type.getLength() != class_size:
-            data_type = self._delete_and_recreate_struct_data_type(
-                class_name_with_namespace, class_size, data_type
+        if new_ghidra_struct.getLength() != class_size:
+            new_ghidra_struct = self._delete_and_recreate_struct_data_type(
+                class_name_with_namespace, class_size, new_ghidra_struct
            )

-        # can be missing when no new fields are declared
-        components: list[dict[str, Any]] = field_list.get("members") or []
-
-        super_type = field_list.get("super")
-        if super_type is not None:
-            components.insert(0, {"type": super_type, "offset": 0, "name": "base"})
-
        for component in components:
-            ghidra_type = self.import_pdb_type_into_ghidra(component["type"])
-            logger.debug("Adding component to class: %s", component)
+            offset: int = component["offset"]
+            logger.debug(
+                "Adding component %s to class: %s", component, class_name_with_namespace
+            )

            try:
-                # for better logs
-                data_type.replaceAtOffset(
-                    component["offset"], ghidra_type, -1, component["name"], None
+                # Make sure there is room for the new structure and that we have no collision.
+                existing_type = new_ghidra_struct.getComponentAt(offset)
+                assert (
+                    existing_type is not None
+                ), f"Struct collision: Offset {offset} in {class_name_with_namespace} is overlapped by another component"
+
+                if existing_type.getDataType().getName() != "undefined":
+                    # collision of structs beginning in the same place -> likely due to unions
+                    logger.warning(
+                        "Struct collision: Offset %d of %s already has a field (likely an inline union)",
+                        offset,
+                        class_name_with_namespace,
+                    )
+
+                new_ghidra_struct.replaceAtOffset(
+                    offset,
+                    component["type"],
+                    -1,  # set to -1 for fixed-size components
+                    component["name"],  # name
+                    None,  # comment
                )
            except Exception as e:
-                raise StructModificationError(type_in_pdb) from e
-
-        logger.info("Finished importing class %s", class_name_with_namespace)
-
-        return data_type
+                raise StructModificationError(class_name_with_namespace) from e

    def _get_or_create_namespace(self, class_name_with_namespace: str):
        colon_split = class_name_with_namespace.split("::")
--- a/tools/isledecomp/isledecomp/cvdump/types.py
+++ b/tools/isledecomp/isledecomp/cvdump/types.py
@ -1,3 +1,4 @@
+from dataclasses import dataclass
 import re
 import logging
 from typing import Any, Dict, List, NamedTuple, Optional
@ -26,6 +27,19 @@ class FieldListItem(NamedTuple):
    type: str


+@dataclass
+class VirtualBaseClass:
+    type: str
+    index: int
+    direct: bool
+
+
+@dataclass
+class VirtualBasePointer:
+    vboffset: int
+    bases: list[VirtualBaseClass]
+
+
 class ScalarType(NamedTuple):
    offset: int
    name: Optional[str]
@ -157,6 +171,16 @@ class CvdumpTypesParser:
        r"^\s+list\[\d+\] = LF_BCLASS, (?P<scope>\w+), type = (?P<type>.*), offset = (?P<offset>\d+)"
    )

+    # LF_FIELDLIST virtual direct/indirect base pointer, line 1/2
+    VBCLASS_RE = re.compile(
+        r"^\s+list\[\d+\] = LF_(?P<indirect>I?)VBCLASS, .* base type = (?P<type>.*)$"
+    )
+
+    # LF_FIELDLIST virtual direct/indirect base pointer, line 2/2
+    VBCLASS_LINE_2_RE = re.compile(
+        r"^\s+virtual base ptr = .+, vbpoff = (?P<vboffset>\d+), vbind = (?P<vbindex>\d+)$"
+    )
+
    # LF_FIELDLIST member name (2/2)
    MEMBER_RE = re.compile(r"^\s+member name = '(?P<name>.*)'$")

@ -206,7 +230,7 @@ class CvdumpTypesParser:
        re.compile(r"\s*Arg list type = (?P<arg_list_type>[\w()]+)$"),
        re.compile(
            r"\s*This adjust = (?P<this_adjust>[\w()]+)$"
-        ),  # TODO: figure out the meaning
+        ),  # By how much the incoming pointers are shifted in virtual inheritance; hex value without `0x` prefix
        re.compile(
            r"\s*Func attr = (?P<func_attr>[\w()]+)$"
        ),  # Only for completeness, is always `none`
@ -282,12 +306,12 @@ class CvdumpTypesParser:

        members: List[FieldListItem] = []

-        super_id = field_obj.get("super")
-        if super_id is not None:
+        super_ids = field_obj.get("super", [])
+        for super_id in super_ids:
            # May need to resolve forward ref.
            superclass = self.get(super_id)
            if superclass.members is not None:
-                members = superclass.members
+                members += superclass.members

        raw_members = field_obj.get("members", [])
        members += [
@ -526,7 +550,57 @@ class CvdumpTypesParser:

        # Superclass is set here in the fieldlist rather than in LF_CLASS
        elif (match := self.SUPERCLASS_RE.match(line)) is not None:
-            self._set("super", normalize_type_id(match.group("type")))
+            superclass_list: dict[str, int] = self.keys[self.last_key].setdefault(
+                "super", {}
+            )
+            superclass_list[normalize_type_id(match.group("type"))] = int(
+                match.group("offset")
+            )
+
+        # virtual base class (direct or indirect)
+        elif (match := self.VBCLASS_RE.match(line)) is not None:
+            virtual_base_pointer = self.keys[self.last_key].setdefault(
+                "vbase",
+                VirtualBasePointer(
+                    vboffset=-1,  # default to -1 until we parse the correct value
+                    bases=[],
+                ),
+            )
+            assert isinstance(
+                virtual_base_pointer, VirtualBasePointer
+            )  # type checker only
+
+            virtual_base_pointer.bases.append(
+                VirtualBaseClass(
+                    type=match.group("type"),
+                    index=-1,  # default to -1 until we parse the correct value
+                    direct=match.group("indirect") != "I",
+                )
+            )
+
+        elif (match := self.VBCLASS_LINE_2_RE.match(line)) is not None:
+            virtual_base_pointer = self.keys[self.last_key].get("vbase", None)
+            assert isinstance(
+                virtual_base_pointer, VirtualBasePointer
+            ), "Parsed the second line of an (I)VBCLASS without the first one"
+            vboffset = int(match.group("vboffset"))
+
+            if virtual_base_pointer.vboffset == -1:
+                # default value
+                virtual_base_pointer.vboffset = vboffset
+            elif virtual_base_pointer.vboffset != vboffset:
+                # vboffset is always equal to 4 in our examples. We are not sure if there can be multiple
+                # virtual base pointers, and if so, how the layout is supposed to look.
+                # We therefore assume that there is always only one virtual base pointer.
+                logger.error(
+                    "Unhandled: Found multiple virtual base pointers at offsets %d and %d",
+                    virtual_base_pointer.vboffset,
+                    vboffset,
+                )
+
+            virtual_base_pointer.bases[-1].index = int(match.group("vbindex"))
+            # these come out of order, and the lists are so short that it's fine to sort them every time
+            virtual_base_pointer.bases.sort(key=lambda x: x.index)

        # Member offset and type given on the first of two lines.
        elif (match := self.LIST_RE.match(line)) is not None:
@ -579,7 +653,7 @@ class CvdumpTypesParser:
        else:
            logger.error("Unmatched line in arglist: %s", line[:-1])

-    def read_pointer_line(self, line):
+    def read_pointer_line(self, line: str):
        if (match := self.LF_POINTER_ELEMENT.match(line)) is not None:
            self._set("element_type", match.group("element_type"))
        else:
--- a/tools/isledecomp/tests/test_cvdump_types.py
+++ b/tools/isledecomp/tests/test_cvdump_types.py
@ -6,6 +6,9 @@ from isledecomp.cvdump.types import (
    CvdumpTypesParser,
    CvdumpKeyError,
    CvdumpIntegrityError,
+    FieldListItem,
+    VirtualBaseClass,
+    VirtualBasePointer,
 )

 TEST_LINES = """
@ -245,10 +248,111 @@ NESTED,     enum name = JukeBox::JukeBoxScript, UDT(0x00003cc2)
    list[12] = LF_MEMBER, private, type = T_USHORT(0021), offset = 12
        member name = 'm_length'

+
+0x4dee : Length = 406, Leaf = 0x1203 LF_FIELDLIST
+	list[0] = LF_VBCLASS, public, direct base type = 0x15EA
+		virtual base ptr = 0x43E9, vbpoff = 4, vbind = 3
+	list[1] = LF_IVBCLASS, public, indirect base type = 0x1183
+		virtual base ptr = 0x43E9, vbpoff = 4, vbind = 1
+	list[2] = LF_IVBCLASS, public, indirect base type = 0x1468
+		virtual base ptr = 0x43E9, vbpoff = 4, vbind = 2
+	list[3] = LF_VFUNCTAB, type = 0x2B95
+	list[4] = LF_ONEMETHOD, public, VANILLA, index = 0x15C2, name = 'LegoRaceMap'
+	list[5] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15C3, name = '~LegoRaceMap'
+	list[6] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15C5, name = 'Notify'
+	list[7] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15C4, name = 'ParseAction'
+	list[8] = LF_ONEMETHOD, public, VIRTUAL, index = 0x4DED, name = 'VTable0x70'
+	list[9] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x15C2,
+		vfptr offset = 0, name = 'FUN_1005d4b0'
+	list[10] = LF_MEMBER, private, type = T_UCHAR(0020), offset = 8
+		member name = 'm_parentClass2Field1'
+	list[11] = LF_MEMBER, private, type = T_32PVOID(0403), offset = 12
+		member name = 'm_parentClass2Field2'
+
+0x4def : Length = 34, Leaf = 0x1504 LF_CLASS
+	# members = 21,  field list type 0x4dee, CONSTRUCTOR,
+	Derivation list type 0x0000, VT shape type 0x12a0
+	Size = 436, class name = LegoRaceMap, UDT(0x00004def)
+
 0x4db6 : Length = 30, Leaf = 0x1504 LF_CLASS
    # members = 16,  field list type 0x4db5, CONSTRUCTOR, OVERLOAD,
    Derivation list type 0x0000, VT shape type 0x1266
    Size = 16, class name = MxString, UDT(0x00004db6)
+
+0x5591 : Length = 570, Leaf = 0x1203 LF_FIELDLIST
+	list[0] = LF_VBCLASS, public, direct base type = 0x15EA
+		virtual base ptr = 0x43E9, vbpoff = 4, vbind = 3
+	list[1] = LF_IVBCLASS, public, indirect base type = 0x1183
+		virtual base ptr = 0x43E9, vbpoff = 4, vbind = 1
+	list[2] = LF_IVBCLASS, public, indirect base type = 0x1468
+		virtual base ptr = 0x43E9, vbpoff = 4, vbind = 2
+	list[3] = LF_VFUNCTAB, type = 0x4E11
+	list[4] = LF_ONEMETHOD, public, VANILLA, index = 0x1ABD, name = 'LegoCarRaceActor'
+	list[5] = LF_ONEMETHOD, public, VIRTUAL, index = 0x1AE0, name = 'ClassName'
+	list[6] = LF_ONEMETHOD, public, VIRTUAL, index = 0x1AE1, name = 'IsA'
+	list[7] = LF_ONEMETHOD, public, VIRTUAL, index = 0x1ADD, name = 'VTable0x6c'
+	list[8] = LF_ONEMETHOD, public, VIRTUAL, index = 0x1ADB, name = 'VTable0x70'
+	list[9] = LF_ONEMETHOD, public, VIRTUAL, index = 0x1ADA, name = 'SwitchBoundary'
+	list[10] = LF_ONEMETHOD, public, VIRTUAL, index = 0x1ADC, name = 'VTable0x9c'
+	list[11] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x558E,
+		vfptr offset = 0, name = 'FUN_10080590'
+	list[12] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1AD8,
+		vfptr offset = 4, name = 'FUN_10012bb0'
+	list[13] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1AD9,
+		vfptr offset = 8, name = 'FUN_10012bc0'
+	list[14] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1AD8,
+		vfptr offset = 12, name = 'FUN_10012bd0'
+	list[15] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1AD9,
+		vfptr offset = 16, name = 'FUN_10012be0'
+	list[16] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1AD8,
+		vfptr offset = 20, name = 'FUN_10012bf0'
+	list[17] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1AD9,
+		vfptr offset = 24, name = 'FUN_10012c00'
+	list[18] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1ABD,
+		vfptr offset = 28, name = 'VTable0x1c'
+	list[19] = LF_MEMBER, protected, type = T_REAL32(0040), offset = 8
+		member name = 'm_parentClass1Field1'
+	list[25] = LF_ONEMETHOD, public, VIRTUAL, (compgenx), index = 0x15D1, name = '~LegoCarRaceActor'
+
+0x5592 : Length = 38, Leaf = 0x1504 LF_CLASS
+	# members = 26,  field list type 0x5591, CONSTRUCTOR,
+	Derivation list type 0x0000, VT shape type 0x34c7
+	Size = 416, class name = LegoCarRaceActor, UDT(0x00005592)
+
+0x5593 : Length = 638, Leaf = 0x1203 LF_FIELDLIST
+	list[0] = LF_BCLASS, public, type = 0x5592, offset = 0
+	list[1] = LF_BCLASS, public, type = 0x4DEF, offset = 32
+	list[2] = LF_IVBCLASS, public, indirect base type = 0x1183
+		virtual base ptr = 0x43E9, vbpoff = 4, vbind = 1
+	list[3] = LF_IVBCLASS, public, indirect base type = 0x1468
+		virtual base ptr = 0x43E9, vbpoff = 4, vbind = 2
+	list[4] = LF_IVBCLASS, public, indirect base type = 0x15EA
+		virtual base ptr = 0x43E9, vbpoff = 4, vbind = 3
+	list[5] = LF_ONEMETHOD, public, VANILLA, index = 0x15CD, name = 'LegoRaceCar'
+	list[6] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15CE, name = '~LegoRaceCar'
+	list[7] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15D2, name = 'Notify'
+	list[8] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15E8, name = 'ClassName'
+	list[9] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15E9, name = 'IsA'
+	list[10] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15D5, name = 'ParseAction'
+	list[11] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15D3, name = 'SetWorldSpeed'
+	list[12] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15DF, name = 'VTable0x6c'
+	list[13] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15D3, name = 'VTable0x70'
+	list[14] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15DC, name = 'VTable0x94'
+	list[15] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15E5, name = 'SwitchBoundary'
+	list[16] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15DD, name = 'VTable0x9c'
+	list[17] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x15D4,
+		vfptr offset = 32, name = 'SetMaxLinearVelocity'
+	list[18] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x15D4,
+		vfptr offset = 36, name = 'FUN_10012ff0'
+	list[19] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x5588,
+		vfptr offset = 40, name = 'HandleSkeletonKicks'
+	list[20] = LF_MEMBER, private, type = T_UCHAR(0020), offset = 84
+		member name = 'm_childClassField'
+
+0x5594 : Length = 34, Leaf = 0x1504 LF_CLASS
+	# members = 30,  field list type 0x5593, CONSTRUCTOR,
+	Derivation list type 0x0000, VT shape type 0x2d1e
+	Size = 512, class name = LegoRaceCar, UDT(0x000055bb)
 """


@ -309,6 +413,31 @@ def test_members(parser: CvdumpTypesParser):
        (12, "m_length", "T_USHORT"),
    ]

+    # LegoRaceCar with multiple superclasses
+    assert parser.get("0x5594").members == [
+        FieldListItem(offset=0, name="vftable", type="T_32PVOID"),
+        FieldListItem(offset=0, name="vftable", type="T_32PVOID"),
+        FieldListItem(offset=8, name="m_parentClass1Field1", type="T_REAL32"),
+        FieldListItem(offset=8, name="m_parentClass2Field1", type="T_UCHAR"),
+        FieldListItem(offset=12, name="m_parentClass2Field2", type="T_32PVOID"),
+        FieldListItem(offset=84, name="m_childClassField", type="T_UCHAR"),
+    ]
+
+
+def test_virtual_base_classes(parser: CvdumpTypesParser):
+    """Virtual bases of field list 0x5591 must parse into one vbase entry, sorted by index."""
+
+    lego_car_race_actor = parser.keys.get("0x5591")  # field list of LegoCarRaceActor
+    assert lego_car_race_actor is not None
+    assert lego_car_race_actor["vbase"] == VirtualBasePointer(
+        vboffset=4,  # `vbpoff = 4` on every (I)VBCLASS line of this field list
+        bases=[  # expected in `vbind` order, although the dump lists vbind 3 first
+            VirtualBaseClass(type="0x1183", index=1, direct=False),
+            VirtualBaseClass(type="0x1468", index=2, direct=False),
+            VirtualBaseClass(type="0x15EA", index=3, direct=True),  # the only LF_VBCLASS
+        ],
+    )
+

 def test_members_recursive(parser: CvdumpTypesParser):
    """Make sure that we unwrap the dependency tree correctly."""