Use reccmp as a python requirement (#1116)
* Use reccmp as a python requirement
* Add BETA10 to reccmp-project.yml
.github/workflows/analyze.yml (8 changes)

@@ -17,10 +17,14 @@ jobs:
     steps:
     - uses: actions/checkout@v4

+    - uses: actions/setup-python@v5
+      with:
+        python-version: '3.12'
+
     - name: Install python libraries
       run: |
-        python -m pip install -r tools/requirements.txt
+        pip install -r tools/requirements.txt

     - name: Run decomplint.py
       run: |
-        tools/decomplint/decomplint.py ${{ matrix.who }} --module ${{ matrix.who }} --warnfail
+        reccmp-decomplint ${{ matrix.who }} --module ${{ matrix.who }} --warnfail
.github/workflows/build.yml (29 changes)

@@ -107,6 +107,10 @@ jobs:
     steps:
     - uses: actions/checkout@master

+    - uses: actions/setup-python@v5
+      with:
+        python-version: '3.12'
+
     - uses: actions/download-artifact@master
       with:
        name: Win32
@@ -125,12 +129,17 @@ jobs:
      run: |
        pip install -r tools/requirements.txt

+    - name: Detect binaries
+      run: |
+        reccmp-project detect --what original --search-path legobin
+        reccmp-project detect --what recompiled --search-path build
+
     - name: Summarize Accuracy
       shell: bash
       run: |
-        python3 tools/reccmp/reccmp.py -S CONFIGPROGRESS.SVG --svg-icon tools/reccmp/config.png -H CONFIGPROGRESS.HTML legobin/CONFIG.EXE build/CONFIG.EXE build/CONFIG.PDB . | tee CONFIGPROGRESS.TXT
-        python3 tools/reccmp/reccmp.py -S ISLEPROGRESS.SVG --svg-icon tools/reccmp/isle.png -H ISLEPROGRESS.HTML legobin/ISLE.EXE build/ISLE.EXE build/ISLE.PDB . | tee ISLEPROGRESS.TXT
-        python3 tools/reccmp/reccmp.py -S LEGO1PROGRESS.SVG -T 4252 --svg-icon tools/reccmp/lego1.png -H LEGO1PROGRESS.HTML legobin/LEGO1.DLL build/LEGO1.DLL build/LEGO1.PDB . | tee LEGO1PROGRESS.TXT
+        reccmp-reccmp -S CONFIGPROGRESS.SVG --svg-icon assets/config.png --target CONFIG | tee CONFIGPROGRESS.TXT
+        reccmp-reccmp -S ISLEPROGRESS.SVG --svg-icon assets/isle.png --target ISLE | tee ISLEPROGRESS.TXT
+        reccmp-reccmp -S LEGO1PROGRESS.SVG -T 4252 --svg-icon assets/lego1.png --target LEGO1 | tee LEGO1PROGRESS.TXT

     - name: Compare Accuracy With Current Master
       shell: bash
@@ -147,21 +156,21 @@ jobs:
     - name: Test Exports
       shell: bash
       run: |
-        tools/verexp/verexp.py legobin/LEGO1.DLL build/LEGO1.DLL
+        reccmp-verexp --target LEGO1

     - name: Check Vtables
       shell: bash
       run: |
-        python3 tools/vtable/vtable.py legobin/CONFIG.EXE build/CONFIG.EXE build/CONFIG.PDB .
-        python3 tools/vtable/vtable.py legobin/ISLE.EXE build/ISLE.EXE build/ISLE.PDB .
-        python3 tools/vtable/vtable.py legobin/LEGO1.DLL build/LEGO1.DLL build/LEGO1.PDB .
+        reccmp-vtable --target CONFIG
+        reccmp-vtable --target ISLE
+        reccmp-vtable --target LEGO1

     - name: Check Variables
       shell: bash
       run: |
-        python3 tools/datacmp.py legobin/CONFIG.EXE build/CONFIG.EXE build/CONFIG.PDB .
-        python3 tools/datacmp.py legobin/ISLE.EXE build/ISLE.EXE build/ISLE.PDB .
-        python3 tools/datacmp.py legobin/LEGO1.DLL build/LEGO1.DLL build/LEGO1.PDB .
+        reccmp-datacmp --target CONFIG
+        reccmp-datacmp --target ISLE
+        reccmp-datacmp --target LEGO1

     - name: Upload Artifact
       uses: actions/upload-artifact@master
.github/workflows/format.yml (deleted, 37 lines)

@@ -1,37 +0,0 @@
-name: Format
-
-on: [push, pull_request]
-
-jobs:
-  clang-format:
-    name: 'C++'
-    runs-on: ubuntu-latest
-
-    steps:
-    - uses: actions/checkout@v4
-
-    - name: Run clang-format
-      run: |
-        find CONFIG LEGO1 ISLE -iname '*.h' -o -iname '*.cpp' | xargs \
-          pipx run "clang-format>=17,<18" \
-          --style=file \
-          -i
-        git diff --exit-code
-
-  python-format:
-    name: 'Python'
-    runs-on: ubuntu-latest
-
-    steps:
-    - uses: actions/checkout@v4
-
-    - name: Install python libraries
-      shell: bash
-      run: |
-        pip install black==23.* pylint==3.2.7 pytest==7.* -r tools/requirements.txt
-
-    - name: Run pylint and black
-      shell: bash
-      run: |
-        pylint tools --ignore=build,ncc
-        black --check tools --exclude=ncc
.github/workflows/naming.yml (4 changes)

@@ -15,6 +15,10 @@ jobs:
       with:
         version: "16"

+    - uses: actions/setup-python@v5
+      with:
+        python-version: '3.12'
+
     - name: Install python libraries
       run: |
        pip install -r tools/requirements.txt
.github/workflows/unittest.yml (deleted, 60 lines)

@@ -1,60 +0,0 @@
-name: Test
-
-on: [push, pull_request]
-
-jobs:
-  fetch-deps:
-    name: Download original binaries
-    uses: ./.github/workflows/legobin.yml
-
-  pytest-win:
-    name: 'Python Windows'
-    runs-on: windows-latest
-    needs: fetch-deps
-
-    steps:
-    - uses: actions/checkout@v4
-
-    - name: Restore cached original binaries
-      id: cache-original-binaries
-      uses: actions/cache/restore@v3
-      with:
-        enableCrossOsArchive: true
-        path: legobin
-        key: legobin
-
-    - name: Install python libraries
-      shell: bash
-      run: |
-        pip install pytest -r tools/requirements.txt
-
-    - name: Run python unit tests (Windows)
-      shell: bash
-      run: |
-        pytest tools/isledecomp --lego1=legobin/LEGO1.DLL
-
-  pytest-ubuntu:
-    name: 'Python Linux'
-    runs-on: ubuntu-latest
-    needs: fetch-deps
-
-    steps:
-    - uses: actions/checkout@v4
-
-    - name: Restore cached original binaries
-      id: cache-original-binaries
-      uses: actions/cache/restore@v3
-      with:
-        enableCrossOsArchive: true
-        path: legobin
-        key: legobin
-
-    - name: Install python libraries
-      shell: bash
-      run: |
-        pip install pytest -r tools/requirements.txt
-
-    - name: Run python unit tests (Ubuntu)
-      shell: bash
-      run: |
-        pytest tools/isledecomp --lego1=legobin/LEGO1.DLL
.gitignore (2 changes)

@@ -1,3 +1,5 @@
+reccmp-user.yml
+reccmp-build.yml
 Debug/
 Release/
 *.ncb
CMakeLists.txt

@@ -8,6 +8,7 @@ project(isle CXX)
 include(CheckCXXSourceCompiles)
 include(CMakeDependentOption)
 include(CMakePushCheckState)
+include("${CMAKE_CURRENT_LIST_DIR}/cmake/reccmp.cmake")

 set(CMAKE_EXPORT_COMPILE_COMMANDS TRUE)
 option(ENABLE_CLANG_TIDY "Enable clang-tidy")
@@ -405,6 +406,7 @@ add_library(lego1 SHARED
   LEGO1/main.cpp
   LEGO1/modeldb/modeldb.cpp
 )
+reccmp_add_target(lego1 ID LEGO1)
 register_lego1_target(lego1)

 if (MINGW)
@@ -447,6 +449,7 @@ if (ISLE_BUILD_APP)
     ISLE/res/isle.rc
     ISLE/isleapp.cpp
   )
+  reccmp_add_target(isle ID ISLE)

   target_compile_definitions(isle PRIVATE ISLE_APP)

@@ -477,6 +480,7 @@ if (ISLE_BUILD_CONFIG)
     CONFIG/StdAfx.cpp
     CONFIG/res/config.rc
   )
+  reccmp_add_target(config ID CONFIG)
   target_compile_definitions(config PRIVATE _AFXDLL MXDIRECTX_FOR_CONFIG)
   target_include_directories(config PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/util" "${CMAKE_CURRENT_SOURCE_DIR}/LEGO1")
   if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14)
@@ -603,3 +607,5 @@ if(EXISTS "${CLANGFORMAT_BIN}")
     endif()
   endif()
 endif()
+
+reccmp_configure()
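Condensing the CMake hunks above into one place, a minimal consumer of the new module looks roughly like the following sketch (sources abbreviated; the `cmake_minimum_required` version is illustrative, not taken from the PR):

```cmake
# Minimal sketch of the reccmp CMake integration shown in the hunks above.
cmake_minimum_required(VERSION 3.16)  # version choice is illustrative
project(isle CXX)

include("${CMAKE_CURRENT_LIST_DIR}/cmake/reccmp.cmake")

add_library(lego1 SHARED LEGO1/main.cpp)  # sources abbreviated
reccmp_add_target(lego1 ID LEGO1)         # ID matches a target key in reccmp-project.yml

# Once all targets are registered, emit reccmp-build.yml into the build directory.
reccmp_configure()
```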
README.md

@@ -62,3 +62,4 @@ Right click on `LEGO1.DLL`, select `Properties`, and switch to the `Details` tab

 * ISLE.EXE `md5: f6da12249e03eed1c74810cd23beb9f5`
 * LEGO1.DLL `md5: 4e2f6d969ea2ef8655ba3fc221a0c8fe`
+* CONFIG.DLL `md5: 92d958a64a273662c591c88b09100f4a`
(Three image files changed with identical before/after sizes — 1.4 KiB, 5.3 KiB, and 5.5 KiB — consistent with the config/isle/lego1 icons moving from tools/reccmp/ to assets/, as referenced in build.yml above.)
cmake/reccmp.cmake (new file, 58 lines)

@@ -0,0 +1,58 @@
+function(reccmp_find_project RESULT)
+  set(curdir "${CMAKE_CURRENT_SOURCE_DIR}")
+  while(1)
+    if(EXISTS "${curdir}/reccmp-project.yml")
+      break()
+    endif()
+    get_filename_component(nextdir "${curdir}" DIRECTORY)
+    if(nextdir STREQUAL curdir)
+      set(curdir "${RESULT}-NOTFOUND")
+      break()
+    endif()
+    set(curdir "${nextdir}")
+  endwhile()
+  set("${RESULT}" "${curdir}" PARENT_SCOPE)
+endfunction()
+
+function(reccmp_add_target TARGET)
+  cmake_parse_arguments(ARGS "" "ID" "" ${ARGN})
+  if(NOT ARGS_ID)
+    message(FATAL_ERROR "Missing ID argument")
+  endif()
+  set_property(TARGET ${TARGET} PROPERTY INTERFACE_RECCMP_ID "${ARGS_ID}")
+  set_property(GLOBAL APPEND PROPERTY RECCMP_TARGETS ${TARGET})
+endfunction()
+
+function(reccmp_configure)
+  cmake_parse_arguments(ARGS "COPY_TO_SOURCE_FOLDER" "DIR" "" ${ARGN})
+  set(binary_dir "${CMAKE_BINARY_DIR}")
+  if(ARGS_DIR)
+    set(binary_dir "${ARGS_DIR}")
+  endif()
+
+  reccmp_find_project(reccmp_project_dir)
+  if(NOT reccmp_project_dir)
+    message(FATAL_ERROR "Cannot find reccmp-project.yml")
+  endif()
+
+  if(CMAKE_CONFIGURATION_TYPES)
+    set(outputdir "${binary_dir}/$<CONFIG>")
+  else()
+    set(outputdir "${binary_dir}")
+  endif()
+  set(build_yml_txt "project: '${reccmp_project_dir}'\ntargets:\n")
+  get_property(RECCMP_TARGETS GLOBAL PROPERTY RECCMP_TARGETS)
+  foreach(target ${RECCMP_TARGETS})
+    get_property(id TARGET "${target}" PROPERTY INTERFACE_RECCMP_ID)
+    string(APPEND build_yml_txt "  ${id}:\n")
+    string(APPEND build_yml_txt "    path: '$<TARGET_FILE:${target}>'\n")
+    if(WIN32 AND MSVC)
+      string(APPEND build_yml_txt "    pdb: '$<TARGET_PDB_FILE:${target}>'\n")
+    endif()
+  endforeach()
+  file(GENERATE OUTPUT "${outputdir}/reccmp-build.yml" CONTENT "${build_yml_txt}")
+
+  if(ARGS_COPY_TO_SOURCE_FOLDER)
+    file(GENERATE OUTPUT "${CMAKE_SOURCE_DIR}/reccmp-build.yml" CONTENT "${build_yml_txt}" CONDITION $<CONFIG:Release>)
+  endif()
+endfunction()
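Given the string building in `reccmp_configure()` above, the generated `reccmp-build.yml` should look roughly like this (the paths are illustrative; the real values come from the generator expressions at build time):

```yaml
# Illustrative output of reccmp_configure() for a single target.
project: '/path/to/isle'                 # directory containing reccmp-project.yml
targets:
  LEGO1:
    path: '/path/to/build/LEGO1.DLL'     # $<TARGET_FILE:lego1>
    pdb: '/path/to/build/LEGO1.PDB'      # $<TARGET_PDB_FILE:lego1>, only on WIN32 with MSVC
```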
reccmp-project.yml (new file, 21 lines)

@@ -0,0 +1,21 @@
+targets:
+  ISLE:
+    filename: ISLE.EXE
+    source-root: ISLE
+    hash:
+      sha256: 5cf57c284973fce9d14f5677a2e4435fd989c5e938970764d00c8932ed5128ca
+  LEGO1:
+    filename: LEGO1.DLL
+    source-root: LEGO1
+    hash:
+      sha256: 14645225bbe81212e9bc1919cd8a692b81b8622abb6561280d99b0fc4151ce17
+  CONFIG:
+    filename: CONFIG.EXE
+    source-root: CONFIG
+    hash:
+      sha256: 864766d024d78330fed5e1f6efb2faf815f1b1c3405713a9718059dc9a54e52c
+  BETA10:
+    filename: BETA10.DLL
+    source-root: LEGO1
+    hash:
+      sha256: d91435a40fa31f405fba33b03bd3bd40dcd4ca36ccf8ef6162c6c5ca0d7190e7
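The `hash` entries pin the exact retail binaries the project expects. As a minimal sketch (not part of the PR), verifying local copies against these digests could look like the following; the `legobin` location is an assumption mirroring the CI setup:

```python
import hashlib
from pathlib import Path

# sha256 digests copied from reccmp-project.yml above; legobin paths are assumptions.
EXPECTED = {
    "legobin/ISLE.EXE": "5cf57c284973fce9d14f5677a2e4435fd989c5e938970764d00c8932ed5128ca",
    "legobin/LEGO1.DLL": "14645225bbe81212e9bc1919cd8a692b81b8622abb6561280d99b0fc4151ce17",
    "legobin/CONFIG.EXE": "864766d024d78330fed5e1f6efb2faf815f1b1c3405713a9718059dc9a54e52c",
    "legobin/BETA10.DLL": "d91435a40fa31f405fba33b03bd3bd40dcd4ca36ccf8ef6162c6c5ca0d7190e7",
}

for name, want in EXPECTED.items():
    got = hashlib.sha256(Path(name).read_bytes()).hexdigest()
    print(name, "OK" if got == want else "MISMATCH: " + got)
```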
tools/README.md

@@ -160,58 +160,42 @@ inline virtual const char* ClassName() const override // vtable+0x0c

 Use `pip` to install the required packages to be able to use the Python tools found in this folder:

-```
+```sh
 pip install -r tools/requirements.txt
 ```

+Run the following command to allow reccmp to detect the original LEGO binaries:
+
+```sh
+reccmp-project detect --what original --search-path <paths-to-directories-containing-lego-binaries>
+```
+
+After building recompiled binaries, run the following command in this repository's root:
+
+```sh
+reccmp-project detect --what recompiled --search-path <paths-to-build-directories>
+```
+
 The example usages below assume that the current working directory is this repository's root and that the retail binaries have been copied to `./legobin`.

-* [`decomplint`](/tools/decomplint): Checks the decompilation annotations (see above)
-  * e.g. `py -m tools.decomplint.decomplint --module LEGO1 LEGO1`
-* [`isledecomp`](/tools/isledecomp): A library that implements a parser to identify the decompilation annotations (see above)
+* `reccmp-decomplint`: Checks the decompilation annotations (see above)
+  * e.g. `reccmp-decomplint --module LEGO1 LEGO1`
 * [`ncc`](/tools/ncc): Checks naming conventions based on a set of rules
-* [`reccmp`](/tools/reccmp): Compares an original binary with a recompiled binary, provided a PDB file. For example:
+* `reccmp-reccmp`: Compares an original binary with a recompiled binary, provided a PDB file. For example:
   * Display the diff for a single function: `py -m tools.reccmp.reccmp --verbose 0x100ae1a0 legobin/LEGO1.DLL build/LEGO1.DLL build/LEGO1.PDB .`
   * Generate an HTML report: `py -m tools.reccmp.reccmp --html output.html legobin/LEGO1.DLL build/LEGO1.DLL build/LEGO1.PDB .`
   * Create a base file for diffs: `py -m tools.reccmp.reccmp --json base.json --silent legobin/LEGO1.DLL build/LEGO1.DLL build/LEGO1.PDB .`
   * Diff against a base file: `py -m tools.reccmp.reccmp --diff base.json legobin/LEGO1.DLL build/LEGO1.DLL build/LEGO1.PDB .`
-* [`stackcmp`](/tools/stackcmp): Compares the stack layout for a given function that almost matches.
-  * e.g. `py -m tools.stackcmp.stackcmp legobin/BETA10.DLL build_debug/LEGO1.DLL build_debug/LEGO1.pdb . 0x1007165d`
-* [`roadmap`](/tools/roadmap): Compares symbol locations in an original binary with the same symbol locations of a recompiled binary
-* [`verexp`](/tools/verexp): Verifies exports by comparing the exports of the original DLL and the recompiled DLL
-* [`vtable`](/tools/vtable): Asserts virtual table correctness by comparing a recompiled binary with the original
-  * e.g. `py -m tools.vtable.vtable legobin/LEGO1.DLL build/LEGO1.DLL build/LEGO1.PDB .`
-* [`datacmp.py`](/tools/datacmp.py): Compares global data found in the original with the recompiled version
-  * e.g. `py -m tools.datacmp legobin/LEGO1.DLL build/LEGO1.DLL build/LEGO1.PDB .`
+* `reccmp-stackcmp`: Compares the stack layout for a given function that almost matches.
+  * e.g. `reccmp-stackcmp legobin/BETA10.DLL build_debug/LEGO1.DLL build_debug/LEGO1.pdb . 0x1007165d`
+* `reccmp-roadmap`: Compares symbol locations in an original binary with the same symbol locations of a recompiled binary
+* `reccmp-verexp`: Verifies exports by comparing the exports of the original DLL and the recompiled DLL
+* `reccmp-vtable`: Asserts virtual table correctness by comparing a recompiled binary with the original
+  * e.g. `reccmp-vtable legobin/LEGO1.DLL build/LEGO1.DLL build/LEGO1.PDB .`
+* `reccmp-datacmp`: Compares global data found in the original with the recompiled version
+  * e.g. `reccmp-datacmp legobin/LEGO1.DLL build/LEGO1.DLL build/LEGO1.PDB .`
 * [`patch_c2.py`](/tools/patch_c2.py): Patches `C2.EXE` (part of MSVC 4.20) to get rid of a bugged warning

-## Testing
-
-`isledecomp` comes with a suite of tests. Install `pytest` and run it, passing in the directory:
-
-```
-pip install pytest
-pytest tools/isledecomp/tests/
-```
-
-## Tool Development
-
-In order to keep the Python code clean and consistent, we use `pylint` and `black`:
-
-`pip install black pylint`
-
-### Run pylint (ignores build and virtualenv)
-
-`pylint tools/ --ignore=build,ncc`
-
-### Check Python code formatting without rewriting files
-
-`black --check tools/`
-
-### Apply Python code formatting
-
-`black tools/`

 # Modules
 The following is a list of all the modules found in the annotations (e.g. `// FUNCTION: [module] [address]`) and which binaries they refer to. See [this list of all known versions of the game](https://www.legoisland.org/wiki/LEGO_Island#Download).

@@ -243,7 +227,7 @@ cmake <path-to-source> -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=RelWithDebInfo -D
 ```
 **TODO**: If you can figure out how to make a debug build with SmartHeap enabled, please add it here.

-If you want to run scripts to compare your debug build to `BETA10` (e.g. `reccmp`), it is advisable to add a copy of `LEGO1D.DLL` to `/legobin` and rename it to `BETA10.DLL`.
+If you want to run scripts to compare your debug build to `BETA10` (e.g. `reccmp-reccmp`), it is advisable to add a copy of `LEGO1D.DLL` to `/legobin` and rename it to `BETA10.DLL`.

 ### Finding matching functions

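Putting the README changes together, a local run of the new per-target flow mirrors the CI steps above. This is a sketch: every command and flag is taken verbatim from the diffs, but the combination has not been verified here.

```sh
# One-time setup: tell reccmp where the retail and recompiled binaries live
reccmp-project detect --what original --search-path legobin
reccmp-project detect --what recompiled --search-path build

# Per-target comparisons, as in build.yml
reccmp-reccmp -S LEGO1PROGRESS.SVG --svg-icon assets/lego1.png --target LEGO1
reccmp-vtable --target LEGO1
reccmp-datacmp --target LEGO1
```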
tools/datacmp.py (deleted, 371 lines)

@@ -1,371 +0,0 @@
-# (New) Data comparison.
-
-import os
-import argparse
-import logging
-from enum import Enum
-from typing import Iterable, List, NamedTuple, Optional, Tuple
-from struct import unpack
-from isledecomp.compare import Compare as IsleCompare
-from isledecomp.compare.db import MatchInfo
-from isledecomp.cvdump import Cvdump
-from isledecomp.cvdump.types import (
-    CvdumpKeyError,
-    CvdumpIntegrityError,
-)
-from isledecomp.bin import Bin as IsleBin
-import colorama
-
-colorama.just_fix_windows_console()
-
-
-# Ignore all compare-db messages.
-logging.getLogger("isledecomp.compare").addHandler(logging.NullHandler())
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description="Comparing data values.")
-    parser.add_argument(
-        "original", metavar="original-binary", help="The original binary"
-    )
-    parser.add_argument(
-        "recompiled", metavar="recompiled-binary", help="The recompiled binary"
-    )
-    parser.add_argument(
-        "pdb", metavar="recompiled-pdb", help="The PDB of the recompiled binary"
-    )
-    parser.add_argument(
-        "decomp_dir", metavar="decomp-dir", help="The decompiled source tree"
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action=argparse.BooleanOptionalAction,
-        default=False,
-        help="",
-    )
-    parser.add_argument(
-        "--no-color", "-n", action="store_true", help="Do not color the output"
-    )
-    parser.add_argument(
-        "--all",
-        "-a",
-        dest="show_all",
-        action="store_true",
-        help="Only show variables with a problem",
-    )
-    parser.add_argument(
-        "--print-rec-addr",
-        action="store_true",
-        help="Print addresses of recompiled functions too",
-    )
-
-    (args, _) = parser.parse_known_args()
-
-    if not os.path.isfile(args.original):
-        parser.error(f"Original binary {args.original} does not exist")
-
-    if not os.path.isfile(args.recompiled):
-        parser.error(f"Recompiled binary {args.recompiled} does not exist")
-
-    if not os.path.isfile(args.pdb):
-        parser.error(f"Symbols PDB {args.pdb} does not exist")
-
-    if not os.path.isdir(args.decomp_dir):
-        parser.error(f"Source directory {args.decomp_dir} does not exist")
-
-    return args
-
-
-class CompareResult(Enum):
-    MATCH = 1
-    DIFF = 2
-    ERROR = 3
-    WARN = 4
-
-
-class ComparedOffset(NamedTuple):
-    offset: int
-    # name is None for scalar types
-    name: Optional[str]
-    match: bool
-    values: Tuple[str, str]
-
-
-class ComparisonItem(NamedTuple):
-    """Each variable that was compared"""
-
-    orig_addr: int
-    recomp_addr: int
-    name: str
-
-    # The list of items that were compared.
-    # For a complex type, these are the members.
-    # For a scalar type, this is a list of size one.
-    # If we could not retrieve type information, this is
-    # a list of size one but without any specific type.
-    compared: List[ComparedOffset]
-
-    # If present, the error message from the types parser.
-    error: Optional[str] = None
-
-    # If true, there is no type specified for this variable. (i.e. non-public)
-    # In this case, we can only compare the raw bytes.
-    # This is different from the situation where a type id _is_ given, but
-    # we could not retrieve it for some reason. (This is an error.)
-    raw_only: bool = False
-
-    @property
-    def result(self) -> CompareResult:
-        if self.error is not None:
-            return CompareResult.ERROR
-
-        if all(c.match for c in self.compared):
-            return CompareResult.MATCH
-
-        # Prefer WARN for a diff without complete type information.
-        return CompareResult.WARN if self.raw_only else CompareResult.DIFF
-
-
-def create_comparison_item(
-    var: MatchInfo,
-    compared: Optional[List[ComparedOffset]] = None,
-    error: Optional[str] = None,
-    raw_only: bool = False,
-) -> ComparisonItem:
-    """Helper to create the ComparisonItem from the fields in MatchInfo."""
-    if compared is None:
-        compared = []
-
-    return ComparisonItem(
-        orig_addr=var.orig_addr,
-        recomp_addr=var.recomp_addr,
-        name=var.name,
-        compared=compared,
-        error=error,
-        raw_only=raw_only,
-    )
-
-
-def do_the_comparison(args: argparse.Namespace) -> Iterable[ComparisonItem]:
-    """Run through each variable in our compare DB, then do the comparison
-    according to the variable's type. Emit the result."""
-    with IsleBin(args.original, find_str=True) as origfile, IsleBin(
-        args.recompiled
-    ) as recompfile:
-        isle_compare = IsleCompare(origfile, recompfile, args.pdb, args.decomp_dir)
-
-        # TODO: We don't currently retain the type information of each variable
-        # in our compare DB. To get those, we build this mini-lookup table that
-        # maps recomp addresses to their type.
-        # We still need to build the full compare DB though, because we may
-        # need the matched symbols to compare pointers (e.g. on strings)
-        mini_cvdump = Cvdump(args.pdb).globals().types().run()
-
-        recomp_type_reference = {
-            recompfile.get_abs_addr(g.section, g.offset): g.type
-            for g in mini_cvdump.globals
-            if recompfile.is_valid_section(g.section)
-        }
-
-        for var in isle_compare.get_variables():
-            type_name = recomp_type_reference.get(var.recomp_addr)
-
-            # Start by assuming we can only compare the raw bytes
-            data_size = var.size
-            is_type_aware = type_name is not None
-
-            if is_type_aware:
-                try:
-                    # If we are type-aware, we can get the precise
-                    # data size for the variable.
-                    data_type = mini_cvdump.types.get(type_name)
-                    data_size = data_type.size
-                except (CvdumpKeyError, CvdumpIntegrityError) as ex:
-                    yield create_comparison_item(var, error=repr(ex))
-                    continue
-
-            orig_raw = origfile.read(var.orig_addr, data_size)
-            recomp_raw = recompfile.read(var.recomp_addr, data_size)
-
-            # The IMAGE_SECTION_HEADER defines the SizeOfRawData and VirtualSize for the section.
-            # If VirtualSize > SizeOfRawData, the section is comprised of the initialized data
-            # corresponding to bytes in the file, and the rest is padded with zeroes when
-            # Windows loads the image.
-            # The linker might place variables initialized to zero on the threshold between
-            # physical data and the virtual (uninitialized) data.
-            # If this happens (i.e. we get an incomplete read) we just do the same padding
-            # to prepare for the comparison.
-            if orig_raw is not None and len(orig_raw) < data_size:
-                orig_raw = orig_raw.ljust(data_size, b"\x00")
-
-            if recomp_raw is not None and len(recomp_raw) < data_size:
-                recomp_raw = recomp_raw.ljust(data_size, b"\x00")
-
-            # If one or both variables are entirely uninitialized
-            if orig_raw is None or recomp_raw is None:
-                # If both variables are uninitialized, we consider them equal.
-                match = orig_raw is None and recomp_raw is None
-
-                # We can match a variable initialized to all zeroes with
-                # an uninitialized variable, but this may or may not actually
-                # be correct, so we flag it for the user.
-                uninit_force_match = not match and (
-                    (orig_raw is None and all(b == 0 for b in recomp_raw))
-                    or (recomp_raw is None and all(b == 0 for b in orig_raw))
-                )
-
-                orig_value = "(uninitialized)" if orig_raw is None else "(initialized)"
-                recomp_value = (
-                    "(uninitialized)" if recomp_raw is None else "(initialized)"
-                )
-                yield create_comparison_item(
-                    var,
-                    compared=[
-                        ComparedOffset(
-                            offset=0,
-                            name=None,
-                            match=match,
-                            values=(orig_value, recomp_value),
-                        )
-                    ],
-                    raw_only=uninit_force_match,
-                )
-                continue
-
-            if not is_type_aware:
-                # If there is no specific type information available
-                # (i.e. if this is a static or non-public variable)
-                # then we can only compare the raw bytes.
-                yield create_comparison_item(
-                    var,
-                    compared=[
-                        ComparedOffset(
-                            offset=0,
-                            name="(raw)",
-                            match=orig_raw == recomp_raw,
-                            values=(orig_raw, recomp_raw),
-                        )
-                    ],
-                    raw_only=True,
-                )
-                continue
-
-            # If we are here, we can do the type-aware comparison.
-            compared = []
-            compare_items = mini_cvdump.types.get_scalars_gapless(type_name)
-            format_str = mini_cvdump.types.get_format_string(type_name)
-
-            orig_data = unpack(format_str, orig_raw)
-            recomp_data = unpack(format_str, recomp_raw)
-
-            def pointer_display(addr: int, is_orig: bool) -> str:
-                """Helper to streamline pointer textual display."""
-                if addr == 0:
-                    return "nullptr"
-
-                ptr_match = (
-                    isle_compare.get_by_orig(addr)
-                    if is_orig
-                    else isle_compare.get_by_recomp(addr)
-                )
-
-                if ptr_match is not None:
-                    return f"Pointer to {ptr_match.match_name()}"
-
-                # This variable did not match if we do not have
-                # the pointer target in our DB.
-                return f"Unknown pointer 0x{addr:x}"
-
-            # Could zip here
-            for i, member in enumerate(compare_items):
-                if member.is_pointer:
-                    match = isle_compare.is_pointer_match(orig_data[i], recomp_data[i])
-
-                    value_a = pointer_display(orig_data[i], True)
-                    value_b = pointer_display(recomp_data[i], False)
-
-                    values = (value_a, value_b)
-                else:
-                    match = orig_data[i] == recomp_data[i]
-                    values = (orig_data[i], recomp_data[i])
-
-                compared.append(
-                    ComparedOffset(
-                        offset=member.offset,
-                        name=member.name,
-                        match=match,
-                        values=values,
-                    )
-                )
-
-            yield create_comparison_item(var, compared=compared)
-
-
-def value_get(value: Optional[str], default: str):
-    return value if value is not None else default
-
-
-def main():
-    args = parse_args()
-
-    def display_match(result: CompareResult) -> str:
-        """Helper to return color string or not, depending on user preference"""
-        if args.no_color:
-            return result.name
-
-        match_color = (
-            colorama.Fore.GREEN
-            if result == CompareResult.MATCH
-            else (
-                colorama.Fore.YELLOW
-                if result == CompareResult.WARN
-                else colorama.Fore.RED
-            )
-        )
-        return f"{match_color}{result.name}{colorama.Style.RESET_ALL}"
-
-    var_count = 0
-    problems = 0
-
-    for item in do_the_comparison(args):
-        var_count += 1
-        if item.result in (CompareResult.DIFF, CompareResult.ERROR):
-            problems += 1
-
-        if not args.show_all and item.result == CompareResult.MATCH:
-            continue
-
-        address_display = (
-            f"0x{item.orig_addr:x} / 0x{item.recomp_addr:x}"
-            if args.print_rec_addr
-            else f"0x{item.orig_addr:x}"
-        )
-
-        print(f"{item.name[:80]} ({address_display}) ... {display_match(item.result)} ")
-        if item.error is not None:
-            print(f" {item.error}")
-
-        for c in item.compared:
-            if not args.verbose and c.match:
-                continue
-
-            (value_a, value_b) = c.values
-            if c.match:
-                print(f" {c.offset:5} {value_get(c.name, '(value)'):30} {value_a}")
-            else:
-                print(
-                    f" {c.offset:5} {value_get(c.name, '(value)'):30} {value_a} : {value_b}"
-                )
-
-        if args.verbose:
-            print()
-
-    print(
-        f"{os.path.basename(args.original)} - Variables: {var_count}. Issues: {problems}"
-    )
-    return 0 if problems == 0 else 1
-
-
-if __name__ == "__main__":
-    raise SystemExit(main())
tools/decomplint/decomplint.py (deleted, 103 lines)

@@ -1,103 +0,0 @@
-#!/usr/bin/env python3
-
-import os
-import sys
-import argparse
-import colorama
-from isledecomp.dir import walk_source_dir, is_file_cpp
-from isledecomp.parser import DecompLinter
-
-colorama.just_fix_windows_console()
-
-
-def display_errors(alerts, filename):
-    sorted_alerts = sorted(alerts, key=lambda a: a.line_number)
-
-    for alert in sorted_alerts:
-        error_type = (
-            f"{colorama.Fore.RED}error: "
-            if alert.is_error()
-            else f"{colorama.Fore.YELLOW}warning: "
-        )
-        components = [
-            colorama.Fore.LIGHTWHITE_EX,
-            filename,
-            ":",
-            str(alert.line_number),
-            " : ",
-            error_type,
-            colorama.Fore.LIGHTWHITE_EX,
-            alert.code.name.lower(),
-        ]
-        print("".join(components))
-
-        if alert.line is not None:
-            print(f"{colorama.Fore.WHITE} {alert.line}")
-
-
-def parse_args() -> argparse.Namespace:
-    p = argparse.ArgumentParser(
-        description="Syntax checking and linting for decomp annotation markers."
-    )
-    p.add_argument("target", help="The file or directory to check.")
-    p.add_argument(
-        "--module",
-        required=False,
-        type=str,
-        help="If present, run targeted checks for markers from the given module.",
-    )
-    p.add_argument(
-        "--warnfail",
-        action=argparse.BooleanOptionalAction,
-        default=False,
-        help="Fail if syntax warnings are found.",
-    )
-
-    (args, _) = p.parse_known_args()
-    return args
-
-
-def process_files(files, module=None):
-    warning_count = 0
-    error_count = 0
-
-    linter = DecompLinter()
-    for filename in files:
-        success = linter.check_file(filename, module)
-
-        warnings = [a for a in linter.alerts if a.is_warning()]
-        errors = [a for a in linter.alerts if a.is_error()]
-
-        error_count += len(errors)
-        warning_count += len(warnings)
-
-        if not success:
-            display_errors(linter.alerts, filename)
-            print()
-
-    return (warning_count, error_count)
-
-
-def main():
-    args = parse_args()
-
-    files_to_check = []
-    if os.path.isdir(args.target):
-        files_to_check = list(walk_source_dir(args.target))
-    elif os.path.isfile(args.target) and is_file_cpp(args.target):
-        files_to_check = [args.target]
-    else:
-        sys.exit("Invalid target")
-
-    (warning_count, error_count) = process_files(files_to_check, module=args.module)
-
-    print(colorama.Style.RESET_ALL, end="")
-
-    would_fail = error_count > 0 or (warning_count > 0 and args.warnfail)
-    if would_fail:
-        return 1
-    return 0
-
-
-if __name__ == "__main__":
-    raise SystemExit(main())
tools/ghidra_scripts/README.md (deleted, 25 lines)

@@ -1,25 +0,0 @@
-# Ghidra Scripts
-
-The scripts in this directory provide additional functionality in Ghidra, e.g. imports of symbols and types from the PDB debug symbol file.
-
-## Setup
-
-### Ghidrathon
-Since these scripts and their dependencies are written in Python 3, [Ghidrathon](https://github.com/mandiant/Ghidrathon) must be installed first. Follow the instructions and install a recent build (these scripts were tested with Python 3.12 and Ghidrathon v4.0.0).
-
-### Script Directory
-- In Ghidra, _Open Window -> Script Manager_.
-- Click the _Manage Script Directories_ button on the top right.
-- Click the _Add_ (Plus icon) button and select this file's parent directory.
-- Close the window and click the _Refresh_ button.
-- This script should now be available under the folder _LEGO1_.
-
-### Virtual environment
-As of now, there must be a Python virtual environment set up under `$REPOSITORY_ROOT/.venv`, and the dependencies of `isledecomp` must be installed there, see [here](../README.md#tooling).
-
-## Development
-- Type hints for Ghidra (optional): Download a recent release from https://github.com/VDOO-Connected-Trust/ghidra-pyi-generator,
-  unpack it somewhere, and `pip install` that directory in this virtual environment. This provides types and headers for Python.
-  Be aware that some of these files contain errors - in particular, `from typing import overload` seems to be missing everywhere, leading to spurious type errors.
-- Note that the imported modules persist across multiple runs of the script (see [here](https://github.com/mandiant/Ghidrathon/issues/103)).
-  If you intend to modify an imported library, you have to use `import importlib; importlib.reload(${library})` or restart Ghidra for your changes to have any effect. Unfortunately, even that is not perfectly reliable, so you may still have to restart Ghidra for some changes in `isledecomp` to be applied.
tools/ghidra_scripts/ — PDB import script (deleted, 285 lines)

@@ -1,285 +0,0 @@
-# Imports types and function signatures from debug symbols (PDB file) of the recompilation.
-#
-# This script uses Python 3 and therefore requires Ghidrathon to be installed in Ghidra (see https://github.com/mandiant/Ghidrathon).
-# Furthermore, the virtual environment must be set up beforehand under $REPOSITORY_ROOT/.venv, and all required packages must be installed
-# (see $REPOSITORY_ROOT/tools/README.md).
-# Also, the Python version of the virtual environment must probably match the Python version used for Ghidrathon.
-
-# @author J. Schulz
-# @category LEGO1
-# @keybinding
-# @menupath
-# @toolbar
-
-
-# In order to make this code run both within and outside of Ghidra, the import order is rather unorthodox in this file.
-# That is why some of the lints below are disabled.
-
-# pylint: disable=wrong-import-position,ungrouped-imports
-# pylint: disable=undefined-variable # need to disable this one globally because pylint does not understand e.g. `askYesNo()``
-
-# Disable spurious warnings in vscode / pylance
-# pyright: reportMissingModuleSource=false
-
-import importlib
-import logging.handlers
-import sys
-import logging
-from pathlib import Path
-import traceback
-from typing import TYPE_CHECKING, Optional
-
-
-if TYPE_CHECKING:
-    import ghidra
-    from lego_util.headers import *  # pylint: disable=wildcard-import # these are just for headers
-
-
-logger = logging.getLogger(__name__)
-
-
-def reload_module(module: str):
-    """
-    Due to a quirk in Jep (used by Ghidrathon), imported modules persist for the lifetime of the Ghidra process
-    and are not reloaded when relaunching the script. Therefore, in order to facilitate development
-    we force reload all our own modules at startup. See also https://github.com/mandiant/Ghidrathon/issues/103.
-
-    Note that as of 2024-05-30, this remedy does not work perfectly (yet): Some changes in isledecomp are
-    still not detected correctly and require a Ghidra restart to be applied.
-    """
-    importlib.reload(importlib.import_module(module))
-
-
-reload_module("lego_util.statistics")
-reload_module("lego_util.globals")
-from lego_util.globals import GLOBALS, SupportedModules
-
-
-def setup_logging():
-    logging.root.handlers.clear()
-    formatter = logging.Formatter("%(levelname)-8s %(message)s")
-    # formatter = logging.Formatter("%(name)s %(levelname)-8s %(message)s")  # use this to identify loggers
-    stdout_handler = logging.StreamHandler(sys.stdout)
-    stdout_handler.setFormatter(formatter)
-    file_handler = logging.FileHandler(
-        Path(__file__).absolute().parent.joinpath("import.log"), mode="w"
-    )
-    file_handler.setFormatter(formatter)
-    logging.root.setLevel(GLOBALS.loglevel)
-    logging.root.addHandler(stdout_handler)
-    logging.root.addHandler(file_handler)
-    logger.info("Starting import...")
-
-
-# This script can be run both from Ghidra and as a standalone.
-# In the latter case, only the PDB parser will be used.
-setup_logging()
-try:
-    from ghidra.program.flatapi import FlatProgramAPI
-    from ghidra.util.exception import CancelledException
-
-    GLOBALS.running_from_ghidra = True
-except ImportError as importError:
-    logger.error(
-        "Failed to import Ghidra functions, doing a dry run for the source code parser. "
-        "Has this script been launched from Ghidra?"
-    )
-    logger.debug("Precise import error:", exc_info=importError)
-
-    GLOBALS.running_from_ghidra = False
-    CancelledException = None
-
-
-def get_repository_root():
-    return Path(__file__).absolute().parent.parent.parent
-
-
-def add_python_path(path: str):
-    """
-    Scripts in Ghidra are executed from the tools/ghidra_scripts directory. We need to add
-    a few more paths to the Python path so we can import the other libraries.
-    """
-    venv_path = get_repository_root().joinpath(path)
-    logger.info("Adding %s to Python Path", venv_path)
-    assert venv_path.exists()
-    sys.path.insert(1, str(venv_path))
-
-
-# We need to quote the types here because they might not exist when running without Ghidra
-def import_function_into_ghidra(
-    api: "FlatProgramAPI",
-    pdb_function: "PdbFunction",
-    type_importer: "PdbTypeImporter",
-):
-    hex_original_address = f"{pdb_function.match_info.orig_addr:x}"
-
-    # Find the Ghidra function at that address
-    ghidra_address = getAddressFactory().getAddress(hex_original_address)
-    # pylint: disable=possibly-used-before-assignment
-    function_importer = PdbFunctionImporter.build(api, pdb_function, type_importer)
-
-    ghidra_function = getFunctionAt(ghidra_address)
-    if ghidra_function is None:
-        ghidra_function = createFunction(ghidra_address, "temp")
-        assert (
-            ghidra_function is not None
-        ), f"Failed to create function at {ghidra_address}"
-        logger.info("Created new function at %s", ghidra_address)
-
-    logger.debug("Start handling function '%s'", function_importer.get_full_name())
-
-    if function_importer.matches_ghidra_function(ghidra_function):
-        logger.info(
-            "Skipping function '%s', matches already",
-            function_importer.get_full_name(),
-        )
-        return
-
-    logger.debug(
-        "Modifying function %s at 0x%s",
-        function_importer.get_full_name(),
-        hex_original_address,
-    )
-
-    function_importer.overwrite_ghidra_function(ghidra_function)
-
-    GLOBALS.statistics.functions_changed += 1
-
-
-def process_functions(extraction: "PdbFunctionExtractor"):
-    pdb_functions = extraction.get_function_list()
-
-    if not GLOBALS.running_from_ghidra:
-        logger.info("Completed the dry run outside Ghidra.")
-        return
-
-    api = FlatProgramAPI(currentProgram())
-    # pylint: disable=possibly-used-before-assignment
-    type_importer = PdbTypeImporter(api, extraction)
-
-    for pdb_func in pdb_functions:
-        func_name = pdb_func.match_info.name
-        try:
-            import_function_into_ghidra(api, pdb_func, type_importer)
-            GLOBALS.statistics.successes += 1
-        except Lego1Exception as e:
-            log_and_track_failure(func_name, e)
-        except RuntimeError as e:
-            cause = e.args[0]
-            if CancelledException is not None and isinstance(cause, CancelledException):
-                # let Ghidra's CancelledException pass through
-                logging.critical("Import aborted by the user.")
-                return
-
-            log_and_track_failure(func_name, cause, unexpected=True)
-            logger.error(traceback.format_exc())
-        except Exception as e:  # pylint: disable=broad-exception-caught
-            log_and_track_failure(func_name, e, unexpected=True)
-            logger.error(traceback.format_exc())
-
-
-def log_and_track_failure(
-    function_name: Optional[str], error: Exception, unexpected: bool = False
-):
-    if GLOBALS.statistics.track_failure_and_tell_if_new(error):
-        logger.error(
-            "%s(): %s%s",
-            function_name,
-            "Unexpected error: " if unexpected else "",
-            error,
-        )
-
-
-def main():
-    if GLOBALS.running_from_ghidra:
-        origfile_name = getProgramFile().getName()
-
-        if origfile_name == "LEGO1.DLL":
-            GLOBALS.module = SupportedModules.LEGO1
-        elif origfile_name in ["LEGO1D.DLL", "BETA10.DLL"]:
-            GLOBALS.module = SupportedModules.BETA10
-        else:
-            raise Lego1Exception(
-                f"Unsupported file name in import script: {origfile_name}"
-            )
-
-    logger.info("Importing file: %s", GLOBALS.module.orig_filename())
-
-    repo_root = get_repository_root()
-    origfile_path = repo_root.joinpath("legobin").joinpath(
-        GLOBALS.module.orig_filename()
-    )
-    build_directory = repo_root.joinpath(GLOBALS.module.build_dir_name())
-    recompiledfile_name = f"{GLOBALS.module.recomp_filename_without_extension()}.DLL"
-    recompiledfile_path = build_directory.joinpath(recompiledfile_name)
-    pdbfile_name = f"{GLOBALS.module.recomp_filename_without_extension()}.PDB"
-    pdbfile_path = build_directory.joinpath(pdbfile_name)
-
-    if not GLOBALS.verbose:
-        logging.getLogger("isledecomp.bin").setLevel(logging.WARNING)
-        logging.getLogger("isledecomp.compare.core").setLevel(logging.WARNING)
-        logging.getLogger("isledecomp.compare.db").setLevel(logging.WARNING)
-        logging.getLogger("isledecomp.compare.lines").setLevel(logging.WARNING)
-        logging.getLogger("isledecomp.cvdump.symbols").setLevel(logging.WARNING)
-
-    logger.info("Starting comparison")
-    with Bin(str(origfile_path), find_str=True) as origfile, Bin(
-        str(recompiledfile_path)
-    ) as recompfile:
-        isle_compare = IsleCompare(
-            origfile, recompfile, str(pdbfile_path), str(repo_root)
-        )
-
-    logger.info("Comparison complete.")
-
-    # try to acquire matched functions
-    migration = PdbFunctionExtractor(isle_compare)
-    try:
-        process_functions(migration)
-    finally:
-        if GLOBALS.running_from_ghidra:
-            GLOBALS.statistics.log()
-
-    logger.info("Done")
-
-
-# sys.path is not reset after running the script, so we should restore it
-sys_path_backup = sys.path.copy()
-try:
-    # make modules installed in the venv available in Ghidra
-    add_python_path(".venv/Lib/site-packages")
-    # This one is needed when isledecomp is installed in editable mode in the venv
-    add_python_path("tools/isledecomp")
-
-    import setuptools  # pylint: disable=unused-import # required to fix a distutils issue in Python 3.12
-
-    reload_module("isledecomp")
-    from isledecomp import Bin
-
-    reload_module("isledecomp.compare")
-    from isledecomp.compare import Compare as IsleCompare
-
-    reload_module("isledecomp.compare.db")
-
-    reload_module("lego_util.exceptions")
-    from lego_util.exceptions import Lego1Exception
-
-    reload_module("lego_util.pdb_extraction")
-    from lego_util.pdb_extraction import (
-        PdbFunctionExtractor,
-        PdbFunction,
-    )
-
-    if GLOBALS.running_from_ghidra:
-        reload_module("lego_util.ghidra_helper")
-
-        reload_module("lego_util.function_importer")
-        from lego_util.function_importer import PdbFunctionImporter
-
-        reload_module("lego_util.type_importer")
-        from lego_util.type_importer import PdbTypeImporter
-
-    if __name__ == "__main__":
-        main()
-finally:
-    sys.path = sys_path_backup
tools/ghidra_scripts/lego_util/exceptions.py (deleted, 47 lines)

@@ -1,47 +0,0 @@
-class Lego1Exception(Exception):
-    """
-    Our own base class for exceptions.
-    Makes it easier to distinguish expected and unexpected errors.
-    """
-
-
-class TypeNotFoundError(Lego1Exception):
-    def __str__(self):
-        return f"Type not found in PDB: {self.args[0]}"
-
-
-class TypeNotFoundInGhidraError(Lego1Exception):
-    def __str__(self):
-        return f"Type not found in Ghidra: {self.args[0]}"
-
-
-class TypeNotImplementedError(Lego1Exception):
-    def __str__(self):
-        return f"Import not implemented for type: {self.args[0]}"
-
-
-class ClassOrNamespaceNotFoundInGhidraError(Lego1Exception):
-    def __init__(self, namespaceHierachy: list[str]):
-        super().__init__(namespaceHierachy)
-
-    def get_namespace_str(self) -> str:
-        return "::".join(self.args[0])
-
-    def __str__(self):
-        return f"Class or namespace not found in Ghidra: {self.get_namespace_str()}"
-
-
-class MultipleTypesFoundInGhidraError(Lego1Exception):
-    def __str__(self):
-        return (
-            f"Found multiple types matching '{self.args[0]}' in Ghidra: {self.args[1]}"
-        )
-
-
-class StackOffsetMismatchError(Lego1Exception):
-    pass
-
-
-class StructModificationError(Lego1Exception):
-    def __str__(self):
-        return f"Failed to modify struct in Ghidra: '{self.args[0]}'\nDetailed error: {self.__cause__}"
tools/ghidra_scripts/lego_util/function_importer.py (deleted, 421 lines)

@@ -1,421 +0,0 @@
-# This file can only be imported successfully when run from Ghidra using Ghidrathon.
-
-# Disable spurious warnings in vscode / pylance
-# pyright: reportMissingModuleSource=false
-
-import logging
-from typing import Optional
-from abc import ABC, abstractmethod
-
-from ghidra.program.model.listing import Function, Parameter
-from ghidra.program.flatapi import FlatProgramAPI
-from ghidra.program.model.listing import ParameterImpl
-from ghidra.program.model.symbol import SourceType
-from ghidra.program.model.data import (
-    TypeDef,
-    TypedefDataType,
-    Pointer,
-    ComponentOffsetSettingsDefinition,
-)
-
-from lego_util.pdb_extraction import (
-    PdbFunction,
-    CppRegisterSymbol,
-    CppStackSymbol,
-)
-from lego_util.ghidra_helper import (
-    add_data_type_or_reuse_existing,
-    create_ghidra_namespace,
-    get_or_add_pointer_type,
-    get_ghidra_namespace,
-    sanitize_name,
-)
-
-from lego_util.exceptions import StackOffsetMismatchError, Lego1Exception
-from lego_util.type_importer import PdbTypeImporter
-
-logger = logging.getLogger(__name__)
-
-
-class PdbFunctionImporter(ABC):
-    """A representation of a function from the PDB with each type replaced by a Ghidra type instance."""
-
-    def __init__(
-        self,
-        api: FlatProgramAPI,
-        func: PdbFunction,
-        type_importer: "PdbTypeImporter",
-    ):
-        self.api = api
-        self.match_info = func.match_info
-        self.type_importer = type_importer
-
-        assert self.match_info.name is not None
-
-        colon_split = sanitize_name(self.match_info.name).split("::")
-        self.name = colon_split.pop()
-        namespace_hierachy = colon_split
-        self.namespace = self._do_get_namespace(namespace_hierachy)
-
-    def _do_get_namespace(self, namespace_hierarchy: list[str]):
-        return get_ghidra_namespace(self.api, namespace_hierarchy)
-
-    def get_full_name(self) -> str:
-        return f"{self.namespace.getName()}::{self.name}"
-
-    @staticmethod
-    def build(api: FlatProgramAPI, func: PdbFunction, type_importer: "PdbTypeImporter"):
-        return (
-            ThunkPdbFunctionImport(api, func, type_importer)
-            if func.signature is None
-            else FullPdbFunctionImporter(api, func, type_importer)
-        )
-
-    @abstractmethod
-    def matches_ghidra_function(self, ghidra_function: Function) -> bool:
-        ...
-
-    @abstractmethod
-    def overwrite_ghidra_function(self, ghidra_function: Function):
-        ...
-
-
-class ThunkPdbFunctionImport(PdbFunctionImporter):
-    """For importing thunk functions (like vtordisp or debug build thunks) into Ghidra.
-    Only the name of the function will be imported."""
-
-    def _do_get_namespace(self, namespace_hierarchy: list[str]):
-        """We need to create the namespace because we don't import the return type here"""
-        return create_ghidra_namespace(self.api, namespace_hierarchy)
-
-    def matches_ghidra_function(self, ghidra_function: Function) -> bool:
-        name_match = self.name == ghidra_function.getName(False)
-        namespace_match = self.namespace == ghidra_function.getParentNamespace()
-
-        logger.debug("Matches: namespace=%s name=%s", namespace_match, name_match)
-
-        return name_match and namespace_match
-
-    def overwrite_ghidra_function(self, ghidra_function: Function):
-        ghidra_function.setName(self.name, SourceType.USER_DEFINED)
-        ghidra_function.setParentNamespace(self.namespace)
-
-
-# pylint: disable=too-many-instance-attributes
-class FullPdbFunctionImporter(PdbFunctionImporter):
-    """For importing functions into Ghidra where all information is available."""
-
-    def __init__(
-        self,
-        api: FlatProgramAPI,
-        func: PdbFunction,
-        type_importer: "PdbTypeImporter",
-    ):
-        super().__init__(api, func, type_importer)
-
-        assert func.signature is not None
-        self.signature = func.signature
-
-        self.is_stub = func.is_stub
-
-        if self.signature.class_type is not None:
-            # Import the base class so the namespace exists
-            self.type_importer.import_pdb_type_into_ghidra(self.signature.class_type)
-
-        self.return_type = type_importer.import_pdb_type_into_ghidra(
-            self.signature.return_type
-        )
-        self.arguments = [
-            ParameterImpl(
-                f"param{index}",
-                type_importer.import_pdb_type_into_ghidra(type_name),
-                api.getCurrentProgram(),
-            )
-            for (index, type_name) in enumerate(self.signature.arglist)
-        ]
-
-    def matches_ghidra_function(self, ghidra_function: Function) -> bool:
-        """Checks whether this function declaration already matches the description in Ghidra"""
-        name_match = self.name == ghidra_function.getName(False)
-        namespace_match = self.namespace == ghidra_function.getParentNamespace()
-        ghidra_return_type = ghidra_function.getReturnType()
-        return_type_match = self.return_type == ghidra_return_type
-
-        # Handle edge case: Return type X that is larger than the return register.
-        # In that case, the function returns `X*` and has another argument `X* __return_storage_ptr`.
-        if (
-            (not return_type_match)
-            and (self.return_type.getLength() > 4)
-            and (
-                get_or_add_pointer_type(self.api, self.return_type)
-                == ghidra_return_type
-            )
-            and any(
-                param
-                for param in ghidra_function.getParameters()
-                if param.getName() == "__return_storage_ptr__"
-            )
-        ):
-            logger.debug(
-                "%s has a return type larger than 4 bytes", self.get_full_name()
-            )
-            return_type_match = True
-
-        # match arguments: decide if thiscall or not, and whether the `this` type matches
-        calling_convention_match = (
-            self.signature.call_type == ghidra_function.getCallingConventionName()
-        )
-
-        ghidra_params_without_this = list(ghidra_function.getParameters())
-
-        if calling_convention_match and self.signature.call_type == "__thiscall":
-            this_argument = ghidra_params_without_this.pop(0)
-            calling_convention_match = self._this_type_match(this_argument)
-
-        if self.is_stub:
-            # We do not import the argument list for stubs, so it should be excluded in matches
-            args_match = True
-        elif calling_convention_match:
-            args_match = self._parameter_lists_match(ghidra_params_without_this)
-        else:
-            args_match = False
-
-        logger.debug(
-            "Matches: namespace=%s name=%s return_type=%s calling_convention=%s args=%s",
-            namespace_match,
-            name_match,
-            return_type_match,
-            calling_convention_match,
-            "ignored" if self.is_stub else args_match,
-        )
-
-        return (
-            name_match
-            and namespace_match
-            and return_type_match
-            and calling_convention_match
-            and args_match
-        )
-
-    def _this_type_match(self, this_parameter: Parameter) -> bool:
-        if this_parameter.getName() != "this":
-            logger.info("Expected first argument to be `this` in __thiscall")
-            return False
-
-        if self.signature.this_adjust != 0:
-            # In this case, the `this` argument should be custom defined
-            if not isinstance(this_parameter.getDataType(), TypeDef):
-                logger.info(
-                    "`this` argument is not a typedef while `this adjust` = %d",
-                    self.signature.this_adjust,
-                )
-                return False
-            # We are not checking for the _correct_ `this` type here, which we could do in the future
-
-        return True
-
-    def _parameter_lists_match(self, ghidra_params: "list[Parameter]") -> bool:
-        # Remove return storage pointer from comparison if present.
-        # This is relevant to returning values larger than 4 bytes, and is not mentioned in the PDB
-        ghidra_params = [
-            param
-            for param in ghidra_params
-            if param.getName() != "__return_storage_ptr__"
-        ]
-
-        if len(self.arguments) != len(ghidra_params):
-            logger.info("Mismatching argument count")
-            return False
-
-        for this_arg, ghidra_arg in zip(self.arguments, ghidra_params):
-            # compare argument types
-            if this_arg.getDataType() != ghidra_arg.getDataType():
-                logger.debug(
-                    "Mismatching arg type: expected %s, found %s",
-                    this_arg.getDataType(),
-                    ghidra_arg.getDataType(),
-                )
-                return False
-            # compare argument names
-            stack_match = self.get_matching_stack_symbol(ghidra_arg.getStackOffset())
-            if stack_match is None:
-                logger.debug("Not found on stack: %s", ghidra_arg)
-                return False
-
-            if stack_match.name.startswith("__formal"):
-                # "__formal" is the placeholder for arguments without a name
-                continue
-
-            if stack_match.name == "__$ReturnUdt":
-                # These appear in templates and cannot be set automatically, as they are a NOTYPE
-                continue
-
-            if stack_match.name != ghidra_arg.getName():
-                logger.debug(
-                    "Argument name mismatch: expected %s, found %s",
-                    stack_match.name,
-                    ghidra_arg.getName(),
-                )
-                return False
-        return True
-
-    def overwrite_ghidra_function(self, ghidra_function: Function):
-        """Replace the function declaration in Ghidra by the one derived from C++."""
-
-        if ghidra_function.hasCustomVariableStorage():
-            # Unfortunately, calling `ghidra_function.setCustomVariableStorage(False)`
-            # leads to two `this` parameters. Therefore, we first need to remove all `this` parameters
-            # and then re-generate a new one
-            ghidra_function.replaceParameters(
-                Function.FunctionUpdateType.DYNAMIC_STORAGE_ALL_PARAMS,  # this implicitly sets custom variable storage to False
-                True,
-                SourceType.USER_DEFINED,
-                [
-                    param
-                    for param in ghidra_function.getParameters()
-                    if param.getName() != "this"
-                ],
-            )
-
-            if ghidra_function.hasCustomVariableStorage():
-                raise Lego1Exception("Failed to disable custom variable storage.")
-
-        ghidra_function.setName(self.name, SourceType.USER_DEFINED)
-        ghidra_function.setParentNamespace(self.namespace)
-        ghidra_function.setReturnType(self.return_type, SourceType.USER_DEFINED)
-        ghidra_function.setCallingConvention(self.signature.call_type)
-
-        if self.is_stub:
-            logger.debug(
-                "%s is a stub, skipping parameter import", self.get_full_name()
-            )
-        else:
-            ghidra_function.replaceParameters(
-                Function.FunctionUpdateType.DYNAMIC_STORAGE_ALL_PARAMS,
-                True,  # force
-                SourceType.USER_DEFINED,
-                self.arguments,
-            )
-            self._import_parameter_names(ghidra_function)
-
-        # Special handling for `this adjust` and virtual inheritance
-        if self.signature.this_adjust != 0:
-            self._set_this_adjust(ghidra_function)
-
-    def _import_parameter_names(self, ghidra_function: Function):
-        # When we call `ghidra_function.replaceParameters`, Ghidra will generate the layout.
-        # Now we read the parameters again and match them against the stack layout in the PDB,
-        # both to verify the layout and to set the parameter names.
-        ghidra_parameters: list[Parameter] = ghidra_function.getParameters()
-
-        # Try to add Ghidra function names
-        for index, param in enumerate(ghidra_parameters):
-            if param.isStackVariable():
-                self._rename_stack_parameter(index, param)
-            else:
-                if param.getName() == "this":
-                    # 'this' parameters are auto-generated and cannot be changed
-                    continue
-
-                # Appears to never happen - could in theory be relevant to __fastcall__ functions,
-                # which we haven't seen yet
-                logger.warning(
-                    "Unhandled register variable in %s", self.get_full_name()
-                )
-                continue
-
-    def _rename_stack_parameter(self, index: int, param: Parameter):
-        match = self.get_matching_stack_symbol(param.getStackOffset())
-        if match is None:
-            raise StackOffsetMismatchError(
-                f"Could not find a matching symbol at offset {param.getStackOffset()} in {self.get_full_name()}"
-            )
-
-        if match.data_type == "T_NOTYPE(0000)":
-            logger.warning("Skipping stack parameter of type NOTYPE")
-            return
-
-        if param.getDataType() != self.type_importer.import_pdb_type_into_ghidra(
-            match.data_type
-        ):
-            logger.error(
-                "Type mismatch for parameter: %s in Ghidra, %s in PDB", param, match
-            )
-            return
-
-        name = match.name
-        if name == "__formal":
-            # these can cause name collisions if multiple ones are present
-            name = f"__formal_{index}"
-
-        param.setName(name, SourceType.USER_DEFINED)
-
-    def get_matching_stack_symbol(self, stack_offset: int) -> Optional[CppStackSymbol]:
-        return next(
-            (
-                symbol
-                for symbol in self.signature.stack_symbols
-                if isinstance(symbol, CppStackSymbol)
-                and symbol.stack_offset == stack_offset
-            ),
-            None,
)
|
||||
|
||||
def get_matching_register_symbol(
|
||||
self, register: str
|
||||
) -> Optional[CppRegisterSymbol]:
|
||||
return next(
|
||||
(
|
||||
symbol
|
||||
for symbol in self.signature.stack_symbols
|
||||
if isinstance(symbol, CppRegisterSymbol) and symbol.register == register
|
||||
),
|
||||
None,
|
||||
)
|
||||
|
||||
def _set_this_adjust(
|
||||
self,
|
||||
ghidra_function: Function,
|
||||
):
|
||||
"""
|
||||
When `this adjust` is non-zero, the pointer type of `this` needs to be replaced by an offset version.
|
||||
The offset can only be set on a typedef on the pointer. We also must enable custom storage so we can modify
|
||||
the auto-generated `this` parameter.
|
||||
"""
|
||||
|
||||
# Necessary in order to overwite the auto-generated `this`
|
||||
ghidra_function.setCustomVariableStorage(True)
|
||||
|
||||
this_parameter = next(
|
||||
(
|
||||
param
|
||||
for param in ghidra_function.getParameters()
|
||||
if param.isRegisterVariable() and param.getName() == "this"
|
||||
),
|
||||
None,
|
||||
)
|
||||
|
||||
if this_parameter is None:
|
||||
logger.error(
|
||||
"Failed to find `this` parameter in a function with `this adjust = %d`",
|
||||
self.signature.this_adjust,
|
||||
)
|
||||
else:
|
||||
current_ghidra_type = this_parameter.getDataType()
|
||||
assert isinstance(current_ghidra_type, Pointer)
|
||||
class_name = current_ghidra_type.getDataType().getName()
|
||||
typedef_name = f"{class_name}PtrOffset0x{self.signature.this_adjust:x}"
|
||||
|
||||
typedef_ghidra_type = TypedefDataType(
|
||||
current_ghidra_type.getCategoryPath(),
|
||||
typedef_name,
|
||||
current_ghidra_type,
|
||||
)
|
||||
ComponentOffsetSettingsDefinition.DEF.setValue(
|
||||
typedef_ghidra_type.getDefaultSettings(), self.signature.this_adjust
|
||||
)
|
||||
typedef_ghidra_type = add_data_type_or_reuse_existing(
|
||||
self.api, typedef_ghidra_type
|
||||
)
|
||||
|
||||
this_parameter.setDataType(typedef_ghidra_type, SourceType.USER_DEFINED)
|
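
# Hypothetical example (class name and value assumed, not taken from a real
# import): for a method with `this adjust` = 4 on a class `Foo`, the code
# above creates a typedef `FooPtrOffset0x4` of `Foo *` and stores 4 in its
# component offset setting, so Ghidra's decompiler reads fields of `this`
# relative to the unadjusted start of the object.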
@ -1,129 +0,0 @@
"""A collection of helper functions for the interaction with Ghidra."""

import logging
import re

from lego_util.exceptions import (
    ClassOrNamespaceNotFoundInGhidraError,
    TypeNotFoundInGhidraError,
    MultipleTypesFoundInGhidraError,
)
from lego_util.globals import GLOBALS, SupportedModules

# Disable spurious warnings in vscode / pylance
# pyright: reportMissingModuleSource=false

from ghidra.program.flatapi import FlatProgramAPI
from ghidra.program.model.data import DataType, DataTypeConflictHandler, PointerDataType
from ghidra.program.model.symbol import Namespace

logger = logging.getLogger(__name__)


def get_ghidra_type(api: FlatProgramAPI, type_name: str):
    """
    Searches for the type named `type_name` in Ghidra.

    Raises:
    - TypeNotFoundInGhidraError
    - MultipleTypesFoundInGhidraError
    """
    result = api.getDataTypes(type_name)
    if len(result) == 0:
        raise TypeNotFoundInGhidraError(type_name)
    if len(result) == 1:
        return result[0]

    raise MultipleTypesFoundInGhidraError(type_name, result)


def get_or_add_pointer_type(api: FlatProgramAPI, pointee: DataType) -> DataType:
    new_pointer_data_type = PointerDataType(pointee)
    new_pointer_data_type.setCategoryPath(pointee.getCategoryPath())
    return add_data_type_or_reuse_existing(api, new_pointer_data_type)


def add_data_type_or_reuse_existing(
    api: FlatProgramAPI, new_data_type: DataType
) -> DataType:
    result_data_type = (
        api.getCurrentProgram()
        .getDataTypeManager()
        .addDataType(new_data_type, DataTypeConflictHandler.KEEP_HANDLER)
    )
    if result_data_type is not new_data_type:
        logger.debug(
            "Reusing existing data type instead of new one: %s (class: %s)",
            result_data_type,
            result_data_type.__class__,
        )
    return result_data_type


def get_ghidra_namespace(
    api: FlatProgramAPI, namespace_hierarchy: list[str]
) -> Namespace:
    namespace = api.getCurrentProgram().getGlobalNamespace()
    for part in namespace_hierarchy:
        namespace = api.getNamespace(namespace, part)
        if namespace is None:
            raise ClassOrNamespaceNotFoundInGhidraError(namespace_hierarchy)
    return namespace


def create_ghidra_namespace(
    api: FlatProgramAPI, namespace_hierarchy: list[str]
) -> Namespace:
    namespace = api.getCurrentProgram().getGlobalNamespace()
    for part in namespace_hierarchy:
        namespace = api.getNamespace(namespace, part)
        if namespace is None:
            namespace = api.createNamespace(namespace, part)
    return namespace


# These appear in debug builds
THUNK_OF_RE = re.compile(r"^Thunk of '(.*)'$")


def sanitize_name(name: str) -> str:
    """
    Takes a full class or function name and replaces characters not accepted by Ghidra.
    Applies mostly to templates, names like `vbase destructor`, and thunks in debug builds.
    """
    if (match := THUNK_OF_RE.fullmatch(name)) is not None:
        is_thunk = True
        name = match.group(1)
    else:
        is_thunk = False

    # Replace characters forbidden in Ghidra
    new_name = (
        name.replace("<", "[")
        .replace(">", "]")
        .replace("*", "#")
        .replace(" ", "_")
        .replace("`", "'")
    )

    # Importing function names like `FUN_10001234` into BETA10 can be confusing
    # because Ghidra's auto-generated functions look exactly the same.
    # Therefore, such function names are replaced by `LEGO1_10001234` in the BETA10 import.
    if GLOBALS.module == SupportedModules.BETA10:
        new_name = re.sub(r"FUN_([0-9a-f]{8})", r"LEGO1_\1", new_name)

    if "<" in name:
        new_name = "_template_" + new_name

    if is_thunk:
        split = new_name.split("::")
        split[-1] = "_thunk_" + split[-1]
        new_name = "::".join(split)

    if new_name != name:
        logger.info(
            "Changed class or function name from '%s' to '%s' to avoid Ghidra issues",
            name,
            new_name,
        )
    return new_name
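
# A few input/output pairs for `sanitize_name`, traced from the replacements
# above (the names are illustrative, not taken from the codebase; the second
# example assumes GLOBALS.module == SupportedModules.LEGO1):
#
#   sanitize_name("MxCollection<MxCore *>")       -> "_template_MxCollection[MxCore_#]"
#   sanitize_name("Thunk of 'MxCore::ClassName'") -> "MxCore::_thunk_ClassName"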
@ -1,42 +0,0 @@
import logging
from enum import Enum
from dataclasses import dataclass, field
from lego_util.statistics import Statistics


class SupportedModules(Enum):
    LEGO1 = 1
    BETA10 = 2

    def orig_filename(self):
        if self == self.LEGO1:
            return "LEGO1.DLL"
        return "BETA10.DLL"

    def recomp_filename_without_extension(self):
        # in case we want to support more functions
        return "LEGO1"

    def build_dir_name(self):
        if self == self.BETA10:
            return "build_debug"
        return "build"


@dataclass
class Globals:
    verbose: bool
    loglevel: int
    module: SupportedModules
    running_from_ghidra: bool = False
    # statistics
    statistics: Statistics = field(default_factory=Statistics)


# hard-coded settings that we don't want to prompt in Ghidra every time
GLOBALS = Globals(
    verbose=False,
    # loglevel=logging.INFO,
    loglevel=logging.DEBUG,
    module=SupportedModules.LEGO1,  # this default value will be used when run outside of Ghidra
)
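
# Read off the methods above (assuming the project layout these scripts expect):
#
#   SupportedModules.LEGO1  -> original "LEGO1.DLL",  build dir "build"
#   SupportedModules.BETA10 -> original "BETA10.DLL", build dir "build_debug"
#
# Both compare against the recompiled binary named "LEGO1".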
@ -1,20 +0,0 @@
from typing import TypeVar, Any
import ghidra

# pylint: disable=invalid-name,unused-argument

T = TypeVar("T")

# from ghidra.app.script.GhidraScript
def currentProgram() -> "ghidra.program.model.listing.Program": ...
def getAddressFactory() -> "ghidra.program.model.address.AddressFactory": ...
def state() -> "ghidra.app.script.GhidraState": ...
def askChoice(title: str, message: str, choices: list[T], defaultValue: T) -> T: ...
def askYesNo(title: str, question: str) -> bool: ...
def getFunctionAt(
    entryPoint: ghidra.program.model.address.Address,
) -> ghidra.program.model.listing.Function: ...
def createFunction(
    entryPoint: ghidra.program.model.address.Address, name: str
) -> ghidra.program.model.listing.Function: ...
def getProgramFile() -> Any: ...  # actually java.io.File
@ -1,183 +0,0 @@
from dataclasses import dataclass
import re
from typing import Any, Optional
import logging

from isledecomp.bin import InvalidVirtualAddressError
from isledecomp.cvdump.symbols import SymbolsEntry
from isledecomp.compare import Compare as IsleCompare
from isledecomp.compare.db import MatchInfo

logger = logging.getLogger(__file__)


@dataclass
class CppStackOrRegisterSymbol:
    name: str
    data_type: str


@dataclass
class CppStackSymbol(CppStackOrRegisterSymbol):
    stack_offset: int
    """Should have a value iff `symbol_type == 'S_BPREL32'`."""


@dataclass
class CppRegisterSymbol(CppStackOrRegisterSymbol):
    register: str
    """Should have a value iff `symbol_type == 'S_REGISTER'`. Should always be set/converted to lowercase."""


@dataclass
class FunctionSignature:
    original_function_symbol: SymbolsEntry
    call_type: str
    arglist: list[str]
    return_type: str
    class_type: Optional[str]
    stack_symbols: list[CppStackOrRegisterSymbol]
    # if non-zero: an offset to the `this` parameter in a __thiscall
    this_adjust: int


@dataclass
class PdbFunction:
    match_info: MatchInfo
    signature: Optional[FunctionSignature]
    is_stub: bool


class PdbFunctionExtractor:
    """
    Extracts all information on a given function from the parsed PDB
    and prepares the data for the import in Ghidra.
    """

    def __init__(self, compare: IsleCompare):
        self.compare = compare

    scalar_type_regex = re.compile(r"t_(?P<typename>\w+)(?:\((?P<type_id>\d+)\))?")

    _call_type_map = {
        "ThisCall": "__thiscall",
        "C Near": "default",
        "STD Near": "__stdcall",
    }
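
    # Illustrative matches for `scalar_type_regex` (the type strings follow
    # cvdump's notation; the exact ids here are examples only):
    #
    #   "t_int4(0074)" -> typename="int4",    type_id="0074"
    #   "t_32pvoid"    -> typename="32pvoid", type_id=None
    #
    # `_call_type_map` translates cvdump's calling convention names into the
    # identifiers Ghidra expects, e.g. "ThisCall" -> "__thiscall".
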
    def _get_cvdump_type(self, type_name: Optional[str]) -> Optional[dict[str, Any]]:
        return (
            None
            if type_name is None
            else self.compare.cv.types.keys.get(type_name.lower())
        )

    def get_func_signature(self, fn: SymbolsEntry) -> Optional[FunctionSignature]:
        function_type_str = fn.func_type
        if function_type_str == "T_NOTYPE(0000)":
            logger.debug("Treating NOTYPE function as thunk: %s", fn.name)
            return None

        # get corresponding function type
        function_type = self.compare.cv.types.keys.get(function_type_str.lower())
        if function_type is None:
            logger.error(
                "Could not find function type %s for function %s", fn.func_type, fn.name
            )
            return None

        class_type = function_type.get("class_type")

        arg_list_type = self._get_cvdump_type(function_type.get("arg_list_type"))
        assert arg_list_type is not None
        arg_list_pdb_types = arg_list_type.get("args", [])
        assert arg_list_type["argcount"] == len(arg_list_pdb_types)

        stack_symbols: list[CppStackOrRegisterSymbol] = []

        # For some unexplained reason, the reported stack is offset by 4 when this flag is set.
        # Note that this affects the arguments (ebp + ...) but not the function stack (ebp - ...)
        stack_offset_delta = -4 if fn.frame_pointer_present else 0

        for symbol in fn.stack_symbols:
            if symbol.symbol_type == "S_REGISTER":
                stack_symbols.append(
                    CppRegisterSymbol(
                        symbol.name,
                        symbol.data_type,
                        symbol.location,
                    )
                )
            elif symbol.symbol_type == "S_BPREL32":
                stack_offset = int(symbol.location[1:-1], 16)
                stack_symbols.append(
                    CppStackSymbol(
                        symbol.name,
                        symbol.data_type,
                        stack_offset + stack_offset_delta,
                    )
                )

        call_type = self._call_type_map[function_type["call_type"]]

        # parse as hex number, default to 0
        this_adjust = int(function_type.get("this_adjust", "0"), 16)

        return FunctionSignature(
            original_function_symbol=fn,
            call_type=call_type,
            arglist=arg_list_pdb_types,
            return_type=function_type["return_type"],
            class_type=class_type,
            stack_symbols=stack_symbols,
            this_adjust=this_adjust,
        )
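
    # A hypothetical return value, to make the structure above concrete
    # (all values are assumed for illustration, not extracted from a real PDB):
    #
    #   FunctionSignature(
    #       original_function_symbol=<SymbolsEntry of the S_GPROC32 record>,
    #       call_type="__thiscall",
    #       arglist=[],
    #       return_type="T_UINT4(0075)",
    #       class_type="0x10ab",
    #       stack_symbols=[],
    #       this_adjust=0,
    #   )
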
    def get_function_list(self) -> list[PdbFunction]:
        handled = (
            self.handle_matched_function(match)
            for match in self.compare.get_functions()
        )
        return [signature for signature in handled if signature is not None]

    def handle_matched_function(self, match_info: MatchInfo) -> Optional[PdbFunction]:
        assert match_info.orig_addr is not None
        match_options = self.compare.get_match_options(match_info.orig_addr)
        assert match_options is not None

        function_data = next(
            (
                y
                for y in self.compare.cvdump_analysis.nodes
                if y.addr == match_info.recomp_addr
            ),
            None,
        )
        if function_data is None:
            try:
                # this can be either a thunk (which we want) or an external function
                # (which we don't want), so we tell them apart based on the validity of their address.
                self.compare.orig_bin.get_relative_addr(match_info.orig_addr)
                return PdbFunction(match_info, None, False)
            except InvalidVirtualAddressError:
                logger.debug(
                    "Skipping external function %s (address 0x%x not in original binary)",
                    match_info.name,
                    match_info.orig_addr,
                )
                return None

        function_symbol = function_data.symbol_entry
        if function_symbol is None:
            logger.debug(
                "Could not find function symbol (likely a PUBLICS entry): %s",
                match_info.name,
            )
            return None

        function_signature = self.get_func_signature(function_symbol)

        is_stub = match_options.get("stub", False)

        return PdbFunction(match_info, function_signature, is_stub)
@ -1,68 +0,0 @@
from dataclasses import dataclass, field
import logging

from lego_util.exceptions import (
    TypeNotFoundInGhidraError,
    ClassOrNamespaceNotFoundInGhidraError,
)

logger = logging.getLogger(__name__)


@dataclass
class Statistics:
    functions_changed: int = 0
    successes: int = 0
    failures: dict[str, int] = field(default_factory=dict)
    known_missing_types: dict[str, int] = field(default_factory=dict)
    known_missing_namespaces: dict[str, int] = field(default_factory=dict)

    def track_failure_and_tell_if_new(self, error: Exception) -> bool:
        """
        Adds the error to the statistics. Returns `False` if logging the error would be redundant
        (e.g. because it is a `TypeNotFoundInGhidraError` with a type that has been logged before).
        """
        error_type_name = error.__class__.__name__
        self.failures[error_type_name] = (
            self.failures.setdefault(error_type_name, 0) + 1
        )

        if isinstance(error, TypeNotFoundInGhidraError):
            return self._add_occurrence_and_check_if_new(
                self.known_missing_types, error.args[0]
            )

        if isinstance(error, ClassOrNamespaceNotFoundInGhidraError):
            return self._add_occurrence_and_check_if_new(
                self.known_missing_namespaces, error.get_namespace_str()
            )

        # We do not have detailed tracking for other errors, so we want to log them every time
        return True

    def _add_occurrence_and_check_if_new(self, target: dict[str, int], key: str) -> bool:
        old_count = target.setdefault(key, 0)
        target[key] = old_count + 1
        return old_count == 0

    def log(self):
        logger.info("Statistics:\n~~~~~")
        logger.info(
            "Missing types (with number of occurrences): %s\n~~~~~",
            self.format_statistics(self.known_missing_types),
        )
        logger.info(
            "Missing classes/namespaces (with number of occurrences): %s\n~~~~~",
            self.format_statistics(self.known_missing_namespaces),
        )
        logger.info("Successes: %d", self.successes)
        logger.info("Failures: %s", self.failures)
        logger.info("Functions changed: %d", self.functions_changed)

    def format_statistics(self, stats: dict[str, int]) -> str:
        if len(stats) == 0:
            return "<none>"
        return ", ".join(
            f"{entry[0]} ({entry[1]})"
            for entry in sorted(stats.items(), key=lambda x: x[1], reverse=True)
        )
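
# Deduplication behavior of `track_failure_and_tell_if_new`, traced from the
# code above (the type name is illustrative):
#
#   stats = Statistics()
#   stats.track_failure_and_tell_if_new(TypeNotFoundInGhidraError("MxRect32"))
#   # -> True: first occurrence, worth logging
#   stats.track_failure_and_tell_if_new(TypeNotFoundInGhidraError("MxRect32"))
#   # -> False: already counted, so the caller can skip the redundant log line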
@ -1,541 +0,0 @@
import logging
from typing import Any, Callable, Iterator, Optional, TypeVar

# Disable spurious warnings in vscode / pylance
# pyright: reportMissingModuleSource=false

# pylint: disable=too-many-return-statements # a `match` would be better, but for now we are stuck with Python 3.9
# pylint: disable=no-else-return # Not sure why this rule even is a thing, this is great for checking exhaustiveness

from isledecomp.cvdump.types import VirtualBasePointer
from lego_util.exceptions import (
    ClassOrNamespaceNotFoundInGhidraError,
    TypeNotFoundError,
    TypeNotFoundInGhidraError,
    TypeNotImplementedError,
    StructModificationError,
)
from lego_util.ghidra_helper import (
    add_data_type_or_reuse_existing,
    get_or_add_pointer_type,
    create_ghidra_namespace,
    get_ghidra_namespace,
    get_ghidra_type,
    sanitize_name,
)
from lego_util.pdb_extraction import PdbFunctionExtractor

from ghidra.program.flatapi import FlatProgramAPI
from ghidra.program.model.data import (
    ArrayDataType,
    CategoryPath,
    DataType,
    DataTypeConflictHandler,
    Enum,
    EnumDataType,
    StructureDataType,
    StructureInternal,
    TypedefDataType,
    ComponentOffsetSettingsDefinition,
)
from ghidra.util.task import ConsoleTaskMonitor


logger = logging.getLogger(__name__)


class PdbTypeImporter:
    """Allows PDB types to be imported into Ghidra."""

    def __init__(self, api: FlatProgramAPI, extraction: PdbFunctionExtractor):
        self.api = api
        self.extraction = extraction
        # tracks the structs/classes we have already started to import, otherwise we run into infinite recursion
        self.handled_structs: set[str] = set()

        # tracks the enums we have already handled for the sake of efficiency
        self.handled_enums: dict[str, Enum] = {}

    @property
    def types(self):
        return self.extraction.compare.cv.types

    def import_pdb_type_into_ghidra(
        self, type_index: str, slim_for_vbase: bool = False
    ) -> DataType:
        """
        Recursively imports a type from the PDB into Ghidra.
        @param type_index Either a scalar type like `T_INT4(...)` or a PDB reference like `0x10ba`
        @param slim_for_vbase If true, the current invocation
        imports a superclass of some class where virtual inheritance is involved (directly or indirectly).
        This case requires special handling: Let's say we have `class C: B` and `class B: virtual A`. Then cvdump
        reports a size for B that includes both B's fields as well as the A contained at an offset within B,
        which is not the correct structure to be contained in C. Therefore, we need to create a "slim" version of B
        that fits inside C.
        This value should always be `False` when the referenced type is not (a pointer to) a class.
        """
        type_index_lower = type_index.lower()
        if type_index_lower.startswith("t_"):
            return self._import_scalar_type(type_index_lower)

        try:
            type_pdb = self.extraction.compare.cv.types.keys[type_index_lower]
        except KeyError as e:
            raise TypeNotFoundError(
                f"Failed to find referenced type '{type_index_lower}'"
            ) from e

        type_category = type_pdb["type"]

        # follow forward reference (class, struct, union)
        if type_pdb.get("is_forward_ref", False):
            return self._import_forward_ref_type(
                type_index_lower, type_pdb, slim_for_vbase
            )

        if type_category == "LF_POINTER":
            return get_or_add_pointer_type(
                self.api,
                self.import_pdb_type_into_ghidra(
                    type_pdb["element_type"], slim_for_vbase
                ),
            )
        elif type_category in ["LF_CLASS", "LF_STRUCTURE"]:
            return self._import_class_or_struct(type_pdb, slim_for_vbase)
        elif type_category == "LF_ARRAY":
            return self._import_array(type_pdb)
        elif type_category == "LF_ENUM":
            return self._import_enum(type_pdb)
        elif type_category == "LF_PROCEDURE":
            logger.warning(
                "Not implemented: Function-valued argument or return type will be replaced by void pointer: %s",
                type_pdb,
            )
            return get_ghidra_type(self.api, "void")
        elif type_category == "LF_UNION":
            return self._import_union(type_pdb)
        else:
            raise TypeNotImplementedError(type_pdb)

    _scalar_type_map = {
        "rchar": "char",
        "int4": "int",
        "uint4": "uint",
        "real32": "float",
        "real64": "double",
    }

    def _scalar_type_to_cpp(self, scalar_type: str) -> str:
        if scalar_type.startswith("32p"):
            return f"{self._scalar_type_to_cpp(scalar_type[3:])} *"
        return self._scalar_type_map.get(scalar_type, scalar_type)
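
    # Examples of the recursive "32p" (32-bit pointer) unwrapping above:
    #
    #   _scalar_type_to_cpp("rchar")      -> "char"
    #   _scalar_type_to_cpp("32pint4")    -> "int *"
    #   _scalar_type_to_cpp("32p32pint4") -> "int * *"
    #
    # Unknown scalar names fall through unchanged and are looked up in Ghidra as-is.
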
    def _import_scalar_type(self, type_index_lower: str) -> DataType:
        if (match := self.extraction.scalar_type_regex.match(type_index_lower)) is None:
            raise TypeNotFoundError(f"Type has unexpected format: {type_index_lower}")

        scalar_cpp_type = self._scalar_type_to_cpp(match.group("typename"))
        return get_ghidra_type(self.api, scalar_cpp_type)

    def _import_forward_ref_type(
        self,
        type_index: str,
        type_pdb: dict[str, Any],
        slim_for_vbase: bool = False,
    ) -> DataType:
        referenced_type = type_pdb.get("udt") or type_pdb.get("modifies")
        if referenced_type is None:
            try:
                # Example: HWND__, needs to be created manually
                return get_ghidra_type(self.api, type_pdb["name"])
            except TypeNotFoundInGhidraError as e:
                raise TypeNotImplementedError(
                    f"{type_index}: forward ref without target, needs to be created manually: {type_pdb}"
                ) from e
        logger.debug(
            "Following forward reference from %s to %s",
            type_index,
            referenced_type,
        )
        return self.import_pdb_type_into_ghidra(referenced_type, slim_for_vbase)

    def _import_array(self, type_pdb: dict[str, Any]) -> DataType:
        inner_type = self.import_pdb_type_into_ghidra(type_pdb["array_type"])

        array_total_bytes: int = type_pdb["size"]
        data_type_size = inner_type.getLength()
        array_length, modulus = divmod(array_total_bytes, data_type_size)
        assert (
            modulus == 0
        ), f"Data type size {data_type_size} does not divide array size {array_total_bytes}"

        return ArrayDataType(inner_type, array_length, 0)

    def _import_union(self, type_pdb: dict[str, Any]) -> DataType:
        try:
            logger.debug("Dereferencing union %s", type_pdb)
            union_type = get_ghidra_type(self.api, type_pdb["name"])
            assert (
                union_type.getLength() == type_pdb["size"]
            ), f"Wrong size of existing union type '{type_pdb['name']}': expected {type_pdb['size']}, got {union_type.getLength()}"
            return union_type
        except TypeNotFoundInGhidraError as e:
            # We have so few instances, it is not worth implementing this
            raise TypeNotImplementedError(
                f"Writing union types is not supported. Please add by hand: {type_pdb}"
            ) from e

    def _import_enum(self, type_pdb: dict[str, Any]) -> DataType:
        underlying_type = self.import_pdb_type_into_ghidra(type_pdb["underlying_type"])
        field_list = self.extraction.compare.cv.types.keys.get(type_pdb["field_type"])
        assert field_list is not None, f"Failed to find field list for enum {type_pdb}"

        result = self._get_or_create_enum_data_type(
            type_pdb["name"], underlying_type.getLength()
        )
        # clear existing variants if there are any
        for existing_variant in result.getNames():
            result.remove(existing_variant)

        variants: list[dict[str, Any]] = field_list["variants"]
        for variant in variants:
            result.add(variant["name"], variant["value"])

        return result

    def _import_class_or_struct(
        self,
        type_in_pdb: dict[str, Any],
        slim_for_vbase: bool = False,
    ) -> DataType:
        field_list_type: str = type_in_pdb["field_list_type"]
        field_list = self.types.keys[field_list_type.lower()]

        class_size: int = type_in_pdb["size"]
        class_name_with_namespace: str = sanitize_name(type_in_pdb["name"])
        if slim_for_vbase:
            class_name_with_namespace += "_vbase_slim"

        if class_name_with_namespace in self.handled_structs:
            logger.debug(
                "Class has been handled or is being handled: %s",
                class_name_with_namespace,
            )
            return get_ghidra_type(self.api, class_name_with_namespace)

        logger.debug(
            "--- Beginning to import class/struct '%s'", class_name_with_namespace
        )

        # Add as soon as we start to avoid infinite recursion
        self.handled_structs.add(class_name_with_namespace)

        self._get_or_create_namespace(class_name_with_namespace)

        new_ghidra_struct = self._get_or_create_struct_data_type(
            class_name_with_namespace, class_size
        )

        if (old_size := new_ghidra_struct.getLength()) != class_size:
            logger.warning(
                "Existing class %s had incorrect size %d. Setting to %d...",
                class_name_with_namespace,
                old_size,
                class_size,
            )

        logger.info("Adding class data type %s", class_name_with_namespace)
        logger.debug("Class information: %s", type_in_pdb)

        components: list[dict[str, Any]] = []
        components.extend(self._get_components_from_base_classes(field_list))
        # can be missing when no new fields are declared
        components.extend(self._get_components_from_members(field_list))
        components.extend(
            self._get_components_from_vbase(
                field_list, class_name_with_namespace, new_ghidra_struct
            )
        )

        components.sort(key=lambda c: c["offset"])

        if slim_for_vbase:
            # Make a "slim" version: shrink the size to the fields that are actually present.
            # This makes a difference when the current class uses virtual inheritance
            assert (
                len(components) > 0
            ), f"Error: {class_name_with_namespace} should not be empty. There must be at least one direct or indirect vbase pointer."
            last_component = components[-1]
            class_size = last_component["offset"] + last_component["type"].getLength()

        self._overwrite_struct(
            class_name_with_namespace,
            new_ghidra_struct,
            class_size,
            components,
        )

        logger.info("Finished importing class %s", class_name_with_namespace)

        return new_ghidra_struct

    def _get_components_from_base_classes(self, field_list) -> Iterator[dict[str, Any]]:
        non_virtual_base_classes: dict[str, int] = field_list.get("super", {})

        for super_type, offset in non_virtual_base_classes.items():
            # If we have virtual inheritance _and_ a non-virtual base class here, we play it safe and import the slim version.
            # This is technically not needed if only one of the superclasses uses virtual inheritance, but I am not aware of any instance.
            import_slim_vbase_version_of_superclass = "vbase" in field_list
            ghidra_type = self.import_pdb_type_into_ghidra(
                super_type, slim_for_vbase=import_slim_vbase_version_of_superclass
            )

            yield {
                "type": ghidra_type,
                "offset": offset,
                "name": "base" if offset == 0 else f"base_{ghidra_type.getName()}",
            }

    def _get_components_from_members(self, field_list: dict[str, Any]):
        members: list[dict[str, Any]] = field_list.get("members") or []
        for member in members:
            yield member | {"type": self.import_pdb_type_into_ghidra(member["type"])}

    def _get_components_from_vbase(
        self,
        field_list: dict[str, Any],
        class_name_with_namespace: str,
        current_type: StructureInternal,
    ) -> Iterator[dict[str, Any]]:
        vbasepointer: Optional[VirtualBasePointer] = field_list.get("vbase", None)

        if vbasepointer is not None and any(x.direct for x in vbasepointer.bases):
            vbaseptr_type = get_or_add_pointer_type(
                self.api,
                self._import_vbaseptr(
                    current_type, class_name_with_namespace, vbasepointer
                ),
            )
            yield {
                "type": vbaseptr_type,
                "offset": vbasepointer.vboffset,
                "name": "vbase_offset",
            }

    def _import_vbaseptr(
        self,
        current_type: StructureInternal,
        class_name_with_namespace: str,
        vbasepointer: VirtualBasePointer,
    ) -> StructureInternal:
        pointer_size = 4  # hard-coded to 4 because these are 32-bit binaries

        components = [
            {
                "offset": 0,
                "type": get_or_add_pointer_type(self.api, current_type),
                "name": "o_self",
            }
        ]
        for vbase in vbasepointer.bases:
            vbase_ghidra_type = self.import_pdb_type_into_ghidra(vbase.type)

            type_name = vbase_ghidra_type.getName()

            vbase_ghidra_pointer = get_or_add_pointer_type(self.api, vbase_ghidra_type)
            vbase_ghidra_pointer_typedef = TypedefDataType(
                vbase_ghidra_pointer.getCategoryPath(),
                f"{type_name}PtrOffset",
                vbase_ghidra_pointer,
            )
            # Set a default value of -4 for the pointer offset. While this appears to be correct in many cases,
            # it does not always lead to the best decompile. It can be fine-tuned by hand; the next function call
            # makes sure that we don't overwrite this value on re-running the import.
            ComponentOffsetSettingsDefinition.DEF.setValue(
                vbase_ghidra_pointer_typedef.getDefaultSettings(), -4
            )

            vbase_ghidra_pointer_typedef = add_data_type_or_reuse_existing(
                self.api, vbase_ghidra_pointer_typedef
            )

            components.append(
                {
                    "offset": vbase.index * pointer_size,
                    "type": vbase_ghidra_pointer_typedef,
                    "name": f"o_{type_name}",
                }
            )

        size = len(components) * pointer_size

        new_ghidra_struct = self._get_or_create_struct_data_type(
            f"{class_name_with_namespace}::VBasePtr", size
        )

        self._overwrite_struct(
            f"{class_name_with_namespace}::VBasePtr",
            new_ghidra_struct,
            size,
            components,
        )

        return new_ghidra_struct
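
    # Sketch of the helper struct built above for a hypothetical
    # `class B : virtual A` (offsets follow the code; names and the vbase
    # index of 1 are assumed):
    #
    #   B::VBasePtr (8 bytes)
    #       +0  o_self  B *               pointer type of the enclosing class
    #       +4  o_A     APtrOffset (A *)  typedef carrying the default -4 offset
    #
    # `B` itself then gets a `vbase_offset` member pointing at this table.
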
    def _overwrite_struct(
        self,
        class_name_with_namespace: str,
        new_ghidra_struct: StructureInternal,
        class_size: int,
        components: list[dict[str, Any]],
    ):
        new_ghidra_struct.deleteAll()
        new_ghidra_struct.growStructure(class_size)

        # this case happened e.g. for IUnknown, which linked to an (incorrect) existing library, and some other types as well.
        # Unfortunately, we don't get proper error handling for read-only types.
        # However, we really do NOT want to do this every time because the type might be self-referential and partially imported.
        if new_ghidra_struct.getLength() != class_size:
            new_ghidra_struct = self._delete_and_recreate_struct_data_type(
                class_name_with_namespace, class_size, new_ghidra_struct
            )

        for component in components:
            offset: int = component["offset"]
            logger.debug(
                "Adding component %s to class: %s", component, class_name_with_namespace
            )

            try:
                # Make sure there is room for the new structure and that we have no collision.
                existing_type = new_ghidra_struct.getComponentAt(offset)
                assert (
                    existing_type is not None
                ), f"Struct collision: Offset {offset} in {class_name_with_namespace} is overlapped by another component"

                if existing_type.getDataType().getName() != "undefined":
                    # collision of structs beginning in the same place -> likely due to unions
                    logger.warning(
                        "Struct collision: Offset %d of %s already has a field (likely an inline union)",
                        offset,
                        class_name_with_namespace,
                    )

                new_ghidra_struct.replaceAtOffset(
                    offset,
                    component["type"],
                    -1,  # set to -1 for fixed-size components
                    component["name"],  # name
                    None,  # comment
                )
            except Exception as e:
                raise StructModificationError(class_name_with_namespace) from e

    def _get_or_create_namespace(self, class_name_with_namespace: str):
        colon_split = class_name_with_namespace.split("::")
        class_name = colon_split[-1]
        try:
            get_ghidra_namespace(self.api, colon_split)
            logger.debug("Found existing class/namespace %s", class_name_with_namespace)
        except ClassOrNamespaceNotFoundInGhidraError:
            logger.info("Creating class/namespace %s", class_name_with_namespace)
            class_name = colon_split.pop()
            parent_namespace = create_ghidra_namespace(self.api, colon_split)
            self.api.createClass(parent_namespace, class_name)

    def _get_or_create_enum_data_type(
        self, enum_type_name: str, enum_type_size: int
    ) -> Enum:
        if (known_enum := self.handled_enums.get(enum_type_name, None)) is not None:
            return known_enum

        result = self._get_or_create_data_type(
            enum_type_name,
            "enum",
            Enum,
            lambda: EnumDataType(
                CategoryPath("/imported"), enum_type_name, enum_type_size
            ),
        )
        self.handled_enums[enum_type_name] = result
        return result

    def _get_or_create_struct_data_type(
        self, class_name_with_namespace: str, class_size: int
    ) -> StructureInternal:
        return self._get_or_create_data_type(
            class_name_with_namespace,
            "class/struct",
            StructureInternal,
            lambda: StructureDataType(
                CategoryPath("/imported"), class_name_with_namespace, class_size
            ),
        )

    T = TypeVar("T", bound=DataType)

    def _get_or_create_data_type(
        self,
        type_name: str,
        readable_name_of_type_category: str,
        expected_type: type[T],
        new_instance_callback: Callable[[], T],
    ) -> T:
        """
        Checks if a data type provided under the given name exists in Ghidra.
        Creates one using `new_instance_callback` if there is none.
        Also verifies the data type.

        Note that the return value of `addDataType()` is not the same instance as the input
        even if there is no name collision.
        """
        try:
            data_type = get_ghidra_type(self.api, type_name)
            logger.debug(
                "Found existing %s type %s under category path %s",
                readable_name_of_type_category,
                type_name,
                data_type.getCategoryPath(),
            )
        except TypeNotFoundInGhidraError:
            data_type = (
                self.api.getCurrentProgram()
                .getDataTypeManager()
                .addDataType(
                    new_instance_callback(), DataTypeConflictHandler.KEEP_HANDLER
                )
            )
            logger.info(
                "Created new %s data type %s", readable_name_of_type_category, type_name
            )
        assert isinstance(
            data_type, expected_type
        ), f"Found existing type named {type_name} that is not a {readable_name_of_type_category}"
        return data_type

    def _delete_and_recreate_struct_data_type(
        self,
        class_name_with_namespace: str,
        class_size: int,
        existing_data_type: DataType,
    ) -> StructureInternal:
        logger.warning(
            "Failed to modify data type %s. Will try to delete the existing one and re-create the imported one.",
            class_name_with_namespace,
        )

        assert (
            self.api.getCurrentProgram()
            .getDataTypeManager()
            .remove(existing_data_type, ConsoleTaskMonitor())
        ), f"Failed to delete and re-create data type {class_name_with_namespace}"
        data_type = StructureDataType(
            CategoryPath("/imported"), class_name_with_namespace, class_size
        )
        data_type = (
            self.api.getCurrentProgram()
            .getDataTypeManager()
            .addDataType(data_type, DataTypeConflictHandler.KEEP_HANDLER)
        )
        assert isinstance(data_type, StructureInternal)  # for type checking
        return data_type
2
tools/isledecomp/.gitignore
vendored
@ -1,2 +0,0 @@
isledecomp.egg-info/
build
@ -1,4 +0,0 @@
from .bin import *
from .dir import *
from .parser import *
from .utils import *
@ -1,574 +0,0 @@
import logging
import struct
import bisect
from functools import cached_property
from typing import Iterator, List, Optional, Tuple
from dataclasses import dataclass
from collections import namedtuple


class MZHeaderNotFoundError(Exception):
    """MZ magic string not found at the start of the binary."""


class PEHeaderNotFoundError(Exception):
    """PE magic string not found at the offset given in 0x3c."""


class SectionNotFoundError(KeyError):
    """The specified section was not found in the file."""


class InvalidVirtualAddressError(IndexError):
    """The given virtual address is too high or low
    to point to something in the binary file."""


PEHeader = namedtuple(
    "PEHeader",
    [
        "Signature",
        "Machine",
        "NumberOfSections",
        "TimeDateStamp",
        "PointerToSymbolTable",  # deprecated
        "NumberOfSymbols",  # deprecated
        "SizeOfOptionalHeader",
        "Characteristics",
    ],
)

ImageSectionHeader = namedtuple(
    "ImageSectionHeader",
    [
        "name",
        "virtual_size",
        "virtual_address",
        "size_of_raw_data",
        "pointer_to_raw_data",
        "pointer_to_relocations",
        "pointer_to_line_numbers",
        "number_of_relocations",
        "number_of_line_numbers",
        "characteristics",
    ],
)


@dataclass
class Section:
    name: str
    virtual_size: int
    virtual_address: int
    view: memoryview

    @cached_property
    def size_of_raw_data(self) -> int:
        return len(self.view)

    @cached_property
    def extent(self):
        """Get the highest possible offset of this section"""
        return max(self.size_of_raw_data, self.virtual_size)

    def match_name(self, name: str) -> bool:
        return self.name == name

    def contains_vaddr(self, vaddr: int) -> bool:
        return self.virtual_address <= vaddr < self.virtual_address + self.extent

    def read_virtual(self, vaddr: int, size: int) -> memoryview:
        ofs = vaddr - self.virtual_address

        # Negative index will read from the end, which we don't want
        if ofs < 0:
            raise InvalidVirtualAddressError

        try:
            return self.view[ofs : ofs + size]
        except IndexError as ex:
            raise InvalidVirtualAddressError from ex

    def addr_is_uninitialized(self, vaddr: int) -> bool:
        """We cannot rely on the IMAGE_SCN_CNT_UNINITIALIZED_DATA flag (0x80) in
        the characteristics field so instead we determine it this way."""
        if not self.contains_vaddr(vaddr):
            return False

        # Should include the case where size_of_raw_data == 0,
        # meaning the entire section is uninitialized
        return (self.virtual_size > self.size_of_raw_data) and (
            vaddr - self.virtual_address >= self.size_of_raw_data
        )
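
# Worked example of the Section address math above (numbers assumed): a
# section at virtual_address=0x10001000 with size_of_raw_data=0x400 and
# virtual_size=0x600 has extent=0x600. contains_vaddr(0x100015FF) is True,
# and addr_is_uninitialized(0x10001500) is True because offset 0x500 lies
# beyond the 0x400 bytes of raw data present in the file.
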
logger = logging.getLogger(__name__)


class Bin:
    """Parses a PE format EXE and allows reading data from a virtual address.
    Reference: https://learn.microsoft.com/en-us/windows/win32/debug/pe-format"""

    # pylint: disable=too-many-instance-attributes

    def __init__(self, filename: str, find_str: bool = False) -> None:
        logger.debug('Parsing headers of "%s"... ', filename)
        self.filename = filename
        self.view: Optional[memoryview] = None
        self.imagebase = None
        self.entry = None
        self.sections: List[Section] = []
        self._section_vaddr: List[int] = []
        self.find_str = find_str
        self._potential_strings = {}
        self._relocations = set()
        self._relocated_addrs = set()
        self.imports = []
        self.thunks = []
        self.exports: List[Tuple[int, str]] = []
        self.is_debug: bool = False

    def __enter__(self):
        logger.debug("Bin %s Enter", self.filename)
        with open(self.filename, "rb") as f:
            self.view = memoryview(f.read())

        (mz_str,) = struct.unpack("2s", self.view[0:2])
        if mz_str != b"MZ":
            raise MZHeaderNotFoundError

        # Skip to PE header offset in MZ header.
        (pe_header_start,) = struct.unpack("<I", self.view[0x3C:0x40])

        # PE header offset is absolute, so seek there
        pe_header_view = self.view[pe_header_start:]
        pe_hdr = PEHeader(*struct.unpack("<2s2x2H3I2H", pe_header_view[:0x18]))

        if pe_hdr.Signature != b"PE":
            raise PEHeaderNotFoundError

        optional_hdr = pe_header_view[0x18:]
        (self.imagebase,) = struct.unpack("<i", optional_hdr[0x1C:0x20])
        (entry,) = struct.unpack("<i", optional_hdr[0x10:0x14])
        self.entry = entry + self.imagebase

        (number_of_rva,) = struct.unpack("<i", optional_hdr[0x5C:0x60])
        data_dictionaries = [
            *struct.iter_unpack("<2I", optional_hdr[0x60 : 0x60 + number_of_rva * 8])
        ]

        # Check for presence of .debug subsection in .rdata
        try:
            if data_dictionaries[6][0] != 0:
                self.is_debug = True
        except IndexError:
            pass

        headers_view = optional_hdr[
            pe_hdr.SizeOfOptionalHeader : pe_hdr.SizeOfOptionalHeader
            + 0x28 * pe_hdr.NumberOfSections
        ]
        section_headers = [
            ImageSectionHeader(*h) for h in struct.iter_unpack("<8s6I2HI", headers_view)
        ]

        self.sections = [
            Section(
                name=hdr.name.decode("ascii").rstrip("\x00"),
                virtual_address=self.imagebase + hdr.virtual_address,
                virtual_size=hdr.virtual_size,
                view=self.view[
                    hdr.pointer_to_raw_data : hdr.pointer_to_raw_data
                    + hdr.size_of_raw_data
                ],
            )
            for hdr in section_headers
        ]

        # bisect does not support key on the github CI version of python
        self._section_vaddr = [section.virtual_address for section in self.sections]

        self._populate_relocations()
        self._populate_imports()
        self._populate_thunks()
        # Export dir is always first
        self._populate_exports(*data_dictionaries[0])

        # This is a (semi) expensive lookup that is not necessary in every case.
        # We can find strings in the original if we have coverage using STRING markers.
        # For the recomp, we can find strings using the PDB.
        if self.find_str:
            self._prepare_string_search()

        logger.debug("... Parsing finished")
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        logger.debug("Bin %s Exit", self.filename)
        self.view.release()

    def get_relocated_addresses(self) -> List[int]:
        return sorted(self._relocated_addrs)

    def find_string(self, target: bytes) -> Optional[int]:
        # Pad with null terminator to make sure we don't
        # match on a subset of the full string
        if not target.endswith(b"\x00"):
            target += b"\x00"

        c = target[0]
        if c not in self._potential_strings:
            return None

        for addr in self._potential_strings[c]:
            if target == self.read(addr, len(target)):
                return addr

        return None

    def is_relocated_addr(self, vaddr) -> bool:
        return vaddr in self._relocated_addrs

    def _prepare_string_search(self):
        """We are interested in deduplicated string constants found in the
        .rdata and .data sections. For each relocated address in these sections,
        read the first byte and save the address if that byte is an ASCII character.
        When we search for an arbitrary string later, we can narrow down the list
        of potential locations by a lot."""

        def is_ascii(b):
            return b" " <= b < b"\x7f"

        sect_data = self.get_section_by_name(".data")
        sect_rdata = self.get_section_by_name(".rdata")
        potentials = filter(
            lambda a: sect_data.contains_vaddr(a) or sect_rdata.contains_vaddr(a),
            self.get_relocated_addresses(),
        )

        for addr in potentials:
            c = self.read(addr, 1)
            if c is not None and is_ascii(c):
                k = ord(c)
                if k not in self._potential_strings:
                    self._potential_strings[k] = set()

                self._potential_strings[k].add(addr)

    def _populate_relocations(self):
        """The relocation table in .reloc gives each virtual address where the next four
        bytes are, itself, another virtual address. During loading, these values will be
        patched according to the virtual address space for the image, as provided by Windows.
        We can use this information to get a list of where each significant "thing"
        in the file is located. Anything that is referenced absolutely (i.e. excluding
        jump destinations given by local offset) will be here.
        One use case is to tell whether an immediate value in an operand represents
        a virtual address or just a big number."""

        reloc = self.get_section_by_name(".reloc").view
        ofs = 0
        reloc_addrs = []

        # Parse the structure in .reloc to get the list of locations to check.
        # The first 8 bytes are 2 dwords that give the base page address
        # and the total block size (including this header).
        # The page address is used to compact the list; each entry is only
        # 2 bytes, and these are added to the base to get the full location.
        # If the entry read in is zero, we are at the end of this section and
        # these are padding bytes.
        while True:
            (page_base, block_size) = struct.unpack("<2I", reloc[ofs : ofs + 8])
            if block_size == 0:
                break

            # HACK: ignore the relocation type for now (the top 4 bits of the value).
            values = list(struct.iter_unpack("<H", reloc[ofs + 8 : ofs + block_size]))
            reloc_addrs += [
                self.imagebase + page_base + (v[0] & 0xFFF) for v in values if v[0] != 0
            ]

            ofs += block_size

        # We are now interested in the relocated addresses themselves. Seek to the
        # address where there is a relocation, then read the four bytes into our set.
        reloc_addrs.sort()
        self._relocations = set(reloc_addrs)

        for section_id, offset in map(self.get_relative_addr, reloc_addrs):
            section = self.get_section_by_index(section_id)
            (relocated_addr,) = struct.unpack("<I", section.view[offset : offset + 4])
            self._relocated_addrs.add(relocated_addr)
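
    # A worked example of one .reloc block (values assumed): a header of
    # page_base=0x1000, block_size=0x0C is followed by two 16-bit entries,
    # say 0x3010 and 0x0000. The top nibble (3 = IMAGE_REL_BASED_HIGHLOW) is
    # ignored by the HACK above, the low 12 bits give the offset within the
    # page, and the zero entry is padding. The single address recorded is
    # imagebase + 0x1000 + 0x010.
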
def find_float_consts(self) -> Iterator[Tuple[int, int, float]]:
|
||||
"""Floating point instructions that refer to a memory address can
|
||||
point to constant values. Search the code sections to find FP
|
||||
instructions and check whether the pointer address refers to
|
||||
read-only data."""
|
||||
|
||||
# TODO: Should check any section that has code, not just .text
|
||||
text = self.get_section_by_name(".text")
|
||||
rdata = self.get_section_by_name(".rdata")
|
||||
|
||||
# These are the addresses where a relocation occurs.
|
||||
# Meaning: it points to an absolute address of something
|
||||
for addr in self._relocations:
|
||||
if not text.contains_vaddr(addr):
|
||||
continue
|
||||
|
||||
# Read the two bytes before the relocated address.
|
||||
# We will check against possible float opcodes
|
||||
raw = text.read_virtual(addr - 2, 6)
|
||||
(opcode, opcode_ext, const_addr) = struct.unpack("<BBL", raw)
|
||||
|
||||
# Skip right away if this is not const data
|
||||
if not rdata.contains_vaddr(const_addr):
|
||||
continue
|
||||
|
||||
if opcode_ext in (0x5, 0xD, 0x15, 0x1D, 0x25, 0x2D, 0x35, 0x3D):
|
||||
if opcode in (0xD8, 0xD9):
|
||||
# dword ptr -- single precision
|
||||
(float_value,) = struct.unpack("<f", self.read(const_addr, 4))
|
||||
yield (const_addr, 4, float_value)
|
||||
|
||||
elif opcode in (0xDC, 0xDD):
|
||||
# qword ptr -- double precision
|
||||
(float_value,) = struct.unpack("<d", self.read(const_addr, 8))
|
||||
yield (const_addr, 8, float_value)
|
||||
|
||||
def _populate_imports(self):
|
||||
"""Parse .idata to find imported DLLs and their functions."""
|
||||
idata_ofs = self.get_section_offset_by_name(".idata")
|
||||
|
||||
def iter_image_import():
|
||||
ofs = idata_ofs
|
||||
while True:
|
||||
# Read 5 dwords until all are zero.
|
||||
image_import_descriptor = struct.unpack("<5I", self.read(ofs, 20))
|
||||
ofs += 20
|
||||
if all(x == 0 for x in image_import_descriptor):
|
||||
break
|
||||
|
||||
(rva_ilt, _, __, dll_name, rva_iat) = image_import_descriptor
|
||||
# Convert relative virtual addresses into absolute
|
||||
yield (
|
||||
self.imagebase + rva_ilt,
|
||||
self.imagebase + dll_name,
|
||||
self.imagebase + rva_iat,
|
||||
)
|
||||
|
||||
image_import_descriptors = list(iter_image_import())
|
||||
|
||||
def iter_imports():
|
||||
# ILT = Import Lookup Table
|
||||
# IAT = Import Address Table
|
||||
# ILT gives us the symbol name of the import.
|
||||
# IAT gives the address. The compiler generated a thunk function
|
||||
# that jumps to the value of this address.
|
||||
for start_ilt, dll_addr, start_iat in image_import_descriptors:
|
||||
dll_name = self.read_string(dll_addr).decode("ascii")
|
||||
ofs_ilt = start_ilt
|
||||
# Address of "__imp__*" symbols.
|
||||
ofs_iat = start_iat
|
||||
while True:
|
||||
(lookup_addr,) = struct.unpack("<L", self.read(ofs_ilt, 4))
|
||||
(import_addr,) = struct.unpack("<L", self.read(ofs_iat, 4))
|
||||
if lookup_addr == 0 or import_addr == 0:
|
||||
break
|
||||
|
||||
# MSB set if this is an ordinal import
|
||||
if lookup_addr & 0x80000000 != 0:
|
||||
ordinal_num = lookup_addr & 0x7FFF
|
||||
symbol_name = f"Ordinal_{ordinal_num}"
|
||||
else:
|
||||
# Skip the "Hint" field, 2 bytes
|
||||
name_ofs = lookup_addr + self.imagebase + 2
|
||||
symbol_name = self.read_string(name_ofs).decode("ascii")
|
||||
|
||||
yield (dll_name, symbol_name, ofs_iat)
|
||||
ofs_ilt += 4
|
||||
ofs_iat += 4
|
||||
|
||||
self.imports = list(iter_imports())
|
||||
|
||||
    def _populate_thunks(self):
        """For each imported function, the compiler generates a thunk function.
        The only instruction in the thunk is a jmp to the address in .idata.
        Search .text to find these functions."""

        text_sect = self.get_section_by_name(".text")
        text_start = text_sect.virtual_address

        # If this is a debug build, read the thunks at the start of .text.
        # They are terminated by a big block of 0xcc padding bytes before the
        # first real function in the section.
        if self.is_debug:
            ofs = 0
            while True:
                (opcode, operand) = struct.unpack("<Bi", text_sect.view[ofs : ofs + 5])
                if opcode != 0xE9:
                    break

                thunk_ofs = text_start + ofs
                jmp_ofs = text_start + ofs + 5 + operand
                self.thunks.append((thunk_ofs, jmp_ofs))
                ofs += 5

        # Now check for import thunks, which are present in both debug and
        # release builds. These use an absolute JMP with the 2-byte opcode
        # 0xff 0x25.
        idata_sect = self.get_section_by_name(".idata")
        ofs = text_start

        for shift in (0, 2, 4):
            window = text_sect.view[shift:]
            win_end = 6 * (len(window) // 6)
            for i, (b0, b1, jmp_ofs) in enumerate(
                struct.iter_unpack("<2BL", window[:win_end])
            ):
                if (b0, b1) == (0xFF, 0x25) and idata_sect.contains_vaddr(jmp_ofs):
                    # Record the address of the jmp instruction and the destination in .idata
                    thunk_ofs = ofs + shift + i * 6
                    self.thunks.append((thunk_ofs, jmp_ofs))
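
    # Sketch of the 6-byte pattern this loop matches (hypothetical .idata
    # address):
    #
    #   import struct
    #   code = bytes([0xFF, 0x25]) + struct.pack("<L", 0x1008F000)
    #   (b0, b1, target) = struct.unpack("<2BL", code)
    #   assert (b0, b1) == (0xFF, 0x25) and target == 0x1008F000
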
    def _populate_exports(self, export_rva: int, _: int):
        """If you are missing a lot of annotations in your file
        (e.g. debug builds) then you can at least match up the
        export symbol names."""

        # Null = no exports
        if export_rva == 0:
            return

        export_start = self.imagebase + export_rva

        # TODO: namedtuple
        export_table = struct.unpack("<2L2H7L", self.read(export_start, 40))

        # TODO: if the number of functions doesn't match the number of names,
        # are the remaining functions ordinals?
        n_functions = export_table[6]

        func_start = export_start + 40
        func_addrs = [
            self.imagebase + rva
            for rva, in struct.iter_unpack("<L", self.read(func_start, 4 * n_functions))
        ]

        name_start = func_start + 4 * n_functions
        name_addrs = [
            self.imagebase + rva
            for rva, in struct.iter_unpack("<L", self.read(name_start, 4 * n_functions))
        ]

        combined = zip(func_addrs, name_addrs)
        self.exports = [
            (func_addr, self.read_string(name_addr))
            for (func_addr, name_addr) in combined
        ]
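
    # Sketch of the "<2L2H7L" layout above with named fields (one possible
    # shape for the namedtuple TODO; field names follow the PE export
    # directory):
    #
    #   from collections import namedtuple
    #   ExportDir = namedtuple(
    #       "ExportDir",
    #       "flags timestamp ver_major ver_minor name_rva ordinal_base "
    #       "n_functions n_names func_tab_rva name_tab_rva ordinal_tab_rva",
    #   )
    #   table = ExportDir._make(struct.unpack("<2L2H7L", self.read(export_start, 40)))
    #   # table.n_functions corresponds to export_table[6] above
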
    def iter_string(self, encoding: str = "ascii") -> Iterator[Tuple[int, str]]:
        """Search for possible strings at each verified address in .data."""
        section = self.get_section_by_name(".data")
        for addr in self._relocated_addrs:
            if section.contains_vaddr(addr):
                raw = self.read_string(addr)
                if raw is None:
                    continue

                try:
                    string = raw.decode(encoding)
                except UnicodeDecodeError:
                    continue

                yield (addr, string)

    def get_section_by_name(self, name: str) -> Section:
        section = next(
            filter(lambda section: section.match_name(name), self.sections),
            None,
        )

        if section is None:
            raise SectionNotFoundError

        return section

    def get_section_by_index(self, index: int) -> Section:
        """Convert 1-based index into 0-based."""
        return self.sections[index - 1]

    def get_section_extent_by_index(self, index: int) -> int:
        return self.get_section_by_index(index).extent

    def get_section_offset_by_index(self, index: int) -> int:
        """The symbol output from cvdump gives addresses in this format: AAAA.BBBBBBBB,
        where A is the 1-based index into the section table and B is the local offset.
        This returns the virtual address for the start of the section at the given index
        so you can compute the virtual address for whatever symbol you are looking at.
        """
        return self.get_section_by_index(index).virtual_address

    def get_section_offset_by_name(self, name: str) -> int:
        """Same as above, but use the section name as the lookup"""

        section = self.get_section_by_name(name)
        return section.virtual_address

    def get_abs_addr(self, section: int, offset: int) -> int:
        """Convenience function for converting section:offset pairs from cvdump
        into an absolute vaddr."""
        return self.get_section_offset_by_index(section) + offset

    def get_relative_addr(self, addr: int) -> Tuple[int, int]:
        """Convert an absolute address back into a (section, offset) pair."""
        i = bisect.bisect_right(self._section_vaddr, addr) - 1
        i = max(0, i)

        section = self.sections[i]
        if section.contains_vaddr(addr):
            return (i + 1, addr - section.virtual_address)

        raise InvalidVirtualAddressError(f"{self.filename} : {hex(addr)}")
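
    # Sketch of the bisect lookup above (hypothetical section start addresses):
    #
    #   import bisect
    #   _section_vaddr = [0x401000, 0x410000, 0x420000]
    #   i = bisect.bisect_right(_section_vaddr, 0x410234) - 1
    #   # i == 1: the address falls within the second section
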
    def is_valid_section(self, section_id: int) -> bool:
        """The PDB will refer to sections that are not listed in the headers,
        so we should ignore those references."""
        try:
            _ = self.get_section_by_index(section_id)
            return True
        except IndexError:
            return False

    def is_valid_vaddr(self, vaddr: int) -> bool:
        """Does this virtual address point to anything in the exe?"""
        try:
            (_, __) = self.get_relative_addr(vaddr)
        except InvalidVirtualAddressError:
            return False

        return True

    def read_string(self, offset: int, chunk_size: int = 1000) -> Optional[bytes]:
        """Read until we find a zero byte."""
        b = self.read(offset, chunk_size)
        if b is None:
            return None

        try:
            return b[: b.index(b"\x00")]
        except ValueError:
            # No terminator found, just return what we have
            return b

    def read(self, vaddr: int, size: int) -> Optional[bytes]:
        """Read (at most) the given number of bytes at the given virtual address.
        If we return None, the given address points to uninitialized data."""
        (section_id, offset) = self.get_relative_addr(vaddr)
        section = self.sections[section_id - 1]

        if section.addr_is_uninitialized(vaddr):
            return None

        # Clamp the read within the extent of the current section.
        # Reading off the end will most likely misrepresent the virtual addressing.
        _size = min(size, section.size_of_raw_data - offset)
        return bytes(section.view[offset : offset + _size])
@ -1 +0,0 @@
from .core import Compare
@ -1,2 +0,0 @@
from .parse import ParseAsm
from .swap import can_resolve_register_differences
@ -1,27 +0,0 @@
# Duplicates removed, according to the mnemonics capstone uses.
# e.g. je and jz are the same instruction. capstone uses je.
# See: /arch/X86/X86GenAsmWriter.inc in the capstone repo.
JUMP_MNEMONICS = {
    "ja",
    "jae",
    "jb",
    "jbe",
    "jcxz",  # unused?
    "je",
    "jecxz",
    "jg",
    "jge",
    "jl",
    "jle",
    "jmp",
    "jne",
    "jno",
    "jnp",
    "jns",
    "jo",
    "jp",
    "js",
}

# Guaranteed to be a single operand.
SINGLE_OPERAND_INSTS = {"push", "call", *JUMP_MNEMONICS}
@ -1,314 +0,0 @@
import re
from typing import List, Tuple, Set

DiffOpcode = Tuple[str, int, int, int, int]

REG_FIND = re.compile(r"(?: |\[)(e?[a-d]x|e?[sd]i|[a-d][lh]|e?[bs]p)")

ALLOWED_JUMP_SWAPS = (
    ("ja", "jb"),
    ("jae", "jbe"),
    ("jb", "ja"),
    ("jbe", "jae"),
    ("jg", "jl"),
    ("jge", "jle"),
    ("jl", "jg"),
    ("jle", "jge"),
    ("je", "je"),
    ("jne", "jne"),
)


def jump_swap_ok(a: str, b: str) -> bool:
    """For the instructions a and b, are they both jump instructions
    that are compatible with a swapped cmp operand order?"""
    # Grab the mnemonic
    (jmp_a, _, __) = a.partition(" ")
    (jmp_b, _, __) = b.partition(" ")

    return (jmp_a, jmp_b) in ALLOWED_JUMP_SWAPS


def is_operand_swap(a: str, b: str) -> bool:
    """This is a hack to avoid parsing the operands. It's not as simple as
    breaking on the comma because templates or string literals interfere
    with this. Instead we check:
    1. Do both strings use the exact same set of characters?
    2. If we do break on ', ', is the first token of each different?
    2 is needed to catch an edge case like:
        cmp eax, dword ptr [ecx + 0x1234]
        cmp ecx, dword ptr [eax + 0x1234]
    """
    return a.partition(", ")[0] != b.partition(", ")[0] and sorted(a) == sorted(b)


def can_cmp_swap(orig: List[str], recomp: List[str]) -> bool:
    # Make sure we have 1 cmp and 1 jmp for both
    if len(orig) != 2 or len(recomp) != 2:
        return False

    if not orig[0].startswith("cmp") or not recomp[0].startswith("cmp"):
        return False

    if not orig[1].startswith("j") or not recomp[1].startswith("j"):
        return False

    # Checking two things:
    # Are the cmp operands flipped?
    # Is the jump instruction compatible with a flip?
    return is_operand_swap(orig[0], recomp[0]) and jump_swap_ok(orig[1], recomp[1])


def patch_jump(a: str, b: str) -> str:
    """For jump instructions a and b, return `(mnemonic_a) (operand_b)`.
    The reason to do it this way (instead of just returning `a`) is that
    the jump instructions might use different displacement offsets
    or labels. If we just replaced `b` with `a`, this diff would be
    incorrectly eliminated."""
    (mnemonic_a, _, __) = a.partition(" ")
    (_, __, operand_b) = b.partition(" ")

    return mnemonic_a + " " + operand_b


def patch_cmp_swaps(
    codes: List[DiffOpcode], orig_asm: List[str], recomp_asm: List[str]
) -> Set[int]:
    """Can we resolve the diffs between orig and recomp by patching
    swapped cmp instructions?
    For example:
        cmp eax, ebx            cmp ebx, eax
        je .label               je .label

        cmp eax, ebx            cmp ebx, eax
        ja .label               jb .label
    """

    fixed_lines = set()

    for code, i1, i2, j1, j2 in codes:
        # To save us the trouble of finding "compatible" cmp instructions,
        # use the diff information we already have.
        if code != "replace":
            continue

        # If the ranges in orig and recomp are not equal, use the shorter one
        for i, j in zip(range(i1, i2), range(j1, j2)):
            if can_cmp_swap(orig_asm[i : i + 2], recomp_asm[j : j + 2]):
                # Patch cmp
                fixed_lines.add(j)

                # Patch the jump if necessary
                patched = patch_jump(orig_asm[i + 1], recomp_asm[j + 1])
                # We only register a fix if it actually matches
                if orig_asm[i + 1] == patched:
                    fixed_lines.add(j + 1)

    return fixed_lines
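

# Illustrative sketch (hypothetical asm text): the cmp operands are flipped
# and ja/jb are a compatible jump swap, so both lines can be excused.
import difflib

_o = ["cmp eax, ebx", "ja 0x10"]
_r = ["cmp ebx, eax", "jb 0x10"]
assert patch_cmp_swaps(
    difflib.SequenceMatcher(None, _o, _r).get_opcodes(), _o, _r
) == {0, 1}

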
def effective_match_possible(orig_asm: List[str], recomp_asm: List[str]) -> bool:
    # We can only declare an effective match based on the text,
    # so we need the same amount of "stuff" in each sequence.
    if len(orig_asm) != len(recomp_asm):
        return False

    # mnemonic_orig = [inst.partition(" ")[0] for inst in orig_asm]
    # mnemonic_recomp = [inst.partition(" ")[0] for inst in recomp_asm]

    # Cannot change mnemonics. Must be same starting list
    # TODO: Fine idea but this will exclude jump swaps for cmp operand order
    # if sorted(mnemonic_orig) != sorted(mnemonic_recomp):
    #     return False

    return True


def find_regs_used(inst: str) -> List[str]:
    return REG_FIND.findall(inst)


def find_regs_changed(a: str, b: str) -> List[Tuple[str, str]]:
    """For instructions a and b, return the pairs of registers at each
    position: (register used in a, register used in b).
    This is not a very precise way to compare the instructions, so it depends
    on the input being two instructions that would match *except* for
    the register choice."""
    return list(zip(REG_FIND.findall(a), REG_FIND.findall(b)))


def bad_register_swaps(
    swaps: Set[int], orig_asm: List[str], recomp_asm: List[str]
) -> Set[int]:
    """The list of recomp indices in `swaps` tells which instructions are
    a match for orig except for the registers used. From that list, check
    whether a register swap should not be allowed.
    For now, this means checking for `push` instructions where the register
    was not used in any other register swaps on previous instructions."""
    rejects = set()

    # For each `push` instruction where we have excused the diff
    pushes = [j for j in swaps if recomp_asm[j].startswith("push")]

    for j in pushes:
        okay = False
        # Get the operands in each
        reg = (orig_asm[j].partition(" ")[2], recomp_asm[j].partition(" ")[2])
        # If this isn't a register at all, ignore it
        try:
            int(reg[0], 16)
            continue
        except ValueError:
            pass

        # For every other excused diff that is *not* a push:
        # Assumes same index in orig as in recomp, but so does our naive match
        for k in swaps.difference(pushes):
            changed_regs = find_regs_changed(orig_asm[k], recomp_asm[k])
            if reg in changed_regs or reg[::-1] in changed_regs:
                okay = True
                break

        if not okay:
            rejects.add(j)

    return rejects


# Instructions that result in a change to the first operand
MODIFIER_INSTRUCTIONS = ("adc", "add", "lea", "mov", "neg", "sbb", "sub", "pop", "xor")


def instruction_alters_regs(inst: str, regs: Set[str]) -> bool:
    (mnemonic, _, op_str) = inst.partition(" ")
    (first_operand, _, __) = op_str.partition(", ")

    return (mnemonic in MODIFIER_INSTRUCTIONS and first_operand in regs) or (
        mnemonic == "call" and "eax" in regs
    )


def relocate_instructions(
    codes: List[DiffOpcode], orig_asm: List[str], recomp_asm: List[str]
) -> Set[int]:
    """Collect the list of instructions deleted from orig and inserted
    into recomp, according to the diff opcodes. Using this list, match up
    any pairs of instructions that we assume to be relocated and return
    the indices in recomp where this has occurred.
    For now, we are checking only for an exact match on the instruction.
    We are not checking whether the given instruction can be moved from
    point A to B. (i.e. does this set a register that is used by the
    instructions between A and B?)"""
    deletes = {
        i for code, i1, i2, _, __ in codes for i in range(i1, i2) if code == "delete"
    }
    inserts = [
        j for code, _, __, j1, j2 in codes for j in range(j1, j2) if code == "insert"
    ]

    relocated = set()

    for j in inserts:
        line = recomp_asm[j]
        recomp_regs_used = set(find_regs_used(line))
        for i in deletes:
            # Check for exact match.
            # TODO: This will grab the first instruction that matches.
            # We should probably use the nearest index instead, if it matters
            if orig_asm[i] == line:
                # To account for a move in either direction
                reloc_start = min(i, j)
                reloc_end = max(i, j)
                if not any(
                    instruction_alters_regs(orig_asm[k], recomp_regs_used)
                    for k in range(reloc_start, reloc_end)
                ):
                    relocated.add(j)
                    deletes.remove(i)
                    break

    return relocated


DWORD_REGS = ("eax", "ebx", "ecx", "edx", "esi", "edi", "ebp", "esp")
WORD_REGS = ("ax", "bx", "cx", "dx", "si", "di", "bp", "sp")
BYTE_REGS = ("ah", "al", "bh", "bl", "ch", "cl", "dh", "dl")


def naive_register_replacement(orig_asm: List[str], recomp_asm: List[str]) -> Set[int]:
    """Replace all registers of the same size with a placeholder string.
    After doing that, compare orig and recomp again.
    Return indices from recomp that are now equal to the same index in orig.
    This requires orig and recomp to have the same number of instructions,
    but that is already a requirement for an effective match."""
    orig_raw = "\n".join(orig_asm)
    recomp_raw = "\n".join(recomp_asm)

    # TODO: hardly the most elegant way to do this.
    for rdw in DWORD_REGS:
        orig_raw = orig_raw.replace(rdw, "~reg4")
        recomp_raw = recomp_raw.replace(rdw, "~reg4")

    for rw in WORD_REGS:
        orig_raw = orig_raw.replace(rw, "~reg2")
        recomp_raw = recomp_raw.replace(rw, "~reg2")

    for rb in BYTE_REGS:
        orig_raw = orig_raw.replace(rb, "~reg1")
        recomp_raw = recomp_raw.replace(rb, "~reg1")

    orig_scrubbed = orig_raw.split("\n")
    recomp_scrubbed = recomp_raw.split("\n")

    return {
        j for j in range(len(recomp_scrubbed)) if orig_scrubbed[j] == recomp_scrubbed[j]
    }
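

# Illustrative sketch (hypothetical asm text): after register scrubbing,
# line 0 compares equal even though the register choice differs.
assert naive_register_replacement(["mov eax, 1"], ["mov ecx, 1"]) == {0}

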
def find_effective_match(
    codes: List[DiffOpcode], orig_asm: List[str], recomp_asm: List[str]
) -> bool:
    """Check whether the two sequences of instructions are an effective match.
    Meaning: do they differ only by instruction order or register selection?"""
    if not effective_match_possible(orig_asm, recomp_asm):
        return False

    already_equal = {
        j for code, _, __, j1, j2 in codes for j in range(j1, j2) if code == "equal"
    }

    # We need to come up with some answer for each of these lines
    recomp_lines_disputed = {
        j
        for code, _, __, j1, j2 in codes
        for j in range(j1, j2)
        if code in ("insert", "replace")
    }

    cmp_swaps = patch_cmp_swaps(codes, orig_asm, recomp_asm)
    # This naive result includes lines that already match, so remove those
    naive_swaps = naive_register_replacement(orig_asm, recomp_asm).difference(
        already_equal
    )
    relocates = relocate_instructions(codes, orig_asm, recomp_asm)

    bad_swaps = bad_register_swaps(naive_swaps, orig_asm, recomp_asm)

    corrections = set().union(
        naive_swaps.difference(bad_swaps),
        cmp_swaps,
        relocates,
    )

    return corrections.issuperset(recomp_lines_disputed)
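

# Illustrative sketch (hypothetical asm text): the sequences differ only in
# the choice of esi vs. edi, so they count as an effective match.
import difflib

_orig = ["mov esi, eax", "push esi", "call 0x1234"]
_recomp = ["mov edi, eax", "push edi", "call 0x1234"]
_codes = difflib.SequenceMatcher(None, _orig, _recomp).get_opcodes()
assert find_effective_match(_codes, _orig, _recomp)

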
def assert_fixup(asm: List[Tuple[str, str]]):
    """Detect assert calls and replace the code filename and line number
    values with macros (from assert.h)."""
    for i, (_, line) in enumerate(asm):
        if "_assert" in line and line.startswith("call"):
            try:
                asm[i - 3] = (asm[i - 3][0], "push __LINE__")
                asm[i - 2] = (asm[i - 2][0], "push __FILE__")
            except IndexError:
                continue
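

# Illustrative sketch (hypothetical addresses): the two pushes that precede
# the _assert call are rewritten to the assert.h macros.
_asm = [
    ("0x1000", "push 0x15"),        # line number
    ("0x1005", "push 0x10074f80"),  # pointer to the filename string
    ("0x100a", "push 0x10074fa0"),  # pointer to the expression string
    ("0x100f", "call _assert"),
]
assert_fixup(_asm)
assert _asm[0] == ("0x1000", "push __LINE__")
assert _asm[1] == ("0x1005", "push __FILE__")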
@ -1,249 +0,0 @@
"""Pre-parser for x86 instructions. Will identify data/jump tables used with
switch statements and local jump/call destinations."""
import re
import bisect
import struct
from enum import Enum, auto
from collections import namedtuple
from typing import Iterable, List, NamedTuple, Optional, Tuple, Union
from capstone import Cs, CS_ARCH_X86, CS_MODE_32
from .const import JUMP_MNEMONICS

disassembler = Cs(CS_ARCH_X86, CS_MODE_32)

DisasmLiteTuple = Tuple[int, int, str, str]
DisasmLiteInst = namedtuple("DisasmLiteInst", "address, size, mnemonic, op_str")

displacement_regex = re.compile(r".*\+ (0x[0-9a-f]+)\]")


class SectionType(Enum):
    CODE = auto()
    DATA_TAB = auto()
    ADDR_TAB = auto()


class FuncSection(NamedTuple):
    type: SectionType
    contents: List[Union[DisasmLiteInst, Tuple[str, int]]]


def stop_at_int3(
    disasm_lite_gen: Iterable[DisasmLiteTuple],
) -> Iterable[DisasmLiteTuple]:
    """Wrapper for capstone disasm_lite generator. We want to stop reading
    instructions if we hit the int3 instruction."""
    for inst in disasm_lite_gen:
        # inst[2] is the mnemonic
        if inst[2] == "int3":
            break

        yield inst


class InstructGen:
    # pylint: disable=too-many-instance-attributes
    def __init__(self, blob: bytes, start: int) -> None:
        self.blob = blob
        self.start = start
        self.end = len(blob) + start
        self.section_end: int = self.end
        self.code_tracks: List[List[DisasmLiteInst]] = []

        # Todo: Could be refactored later
        self.cur_addr: int = 0
        self.cur_section_type: SectionType = SectionType.CODE
        self.section_start = start

        self.sections: List[FuncSection] = []

        self.confirmed_addrs = {}
        self.analysis()

    def _finish_section(self, type_: SectionType, stuff):
        sect = FuncSection(type_, stuff)
        self.sections.append(sect)

    def _insert_confirmed_addr(self, addr: int, type_: SectionType):
        # Ignore addresses outside the bounds of the function
        if not self.start <= addr < self.end:
            return

        self.confirmed_addrs[addr] = type_

        # This newly inserted address might signal the end of this section.
        # For example, a jump table at the end of the function means we should
        # stop reading instructions once we hit that address.
        # However, if there is a jump table in between code sections, we might
        # read a jump to an address back at the beginning of the function
        # (e.g. a loop that spans the entire function),
        # so ignore this address because we have already passed it.
        if type_ != self.cur_section_type and addr > self.cur_addr:
            self.section_end = min(self.section_end, addr)

    def _next_section(self, addr: int) -> Optional[SectionType]:
        """We have reached the start of a new section. Tell what kind of
        data we are looking at (code or other) and how much we should read."""

        # Assume the start of every function is code.
        if addr == self.start:
            self.section_end = self.end
            return SectionType.CODE

        # The start of a new section must be an address that we've seen.
        new_type = self.confirmed_addrs.get(addr)
        if new_type is None:
            return None

        self.cur_section_type = new_type

        # The confirmed addrs dict is sorted by insertion order,
        # i.e. the order in which we read the addresses,
        # so we have to sort and then find the next item
        # to see where this section should end.

        # If we are in a CODE section, ignore contiguous CODE addresses.
        # These are not the start of a new section.
        # However: if we are not in CODE, any upcoming address is a new section.
        # Do this so we can detect contiguous non-CODE sections.
        confirmed = [
            conf_addr
            for (conf_addr, conf_type) in sorted(self.confirmed_addrs.items())
            if self.cur_section_type != SectionType.CODE
            or conf_type != self.cur_section_type
        ]

        index = bisect.bisect_right(confirmed, addr)
        if index < len(confirmed):
            self.section_end = confirmed[index]
        else:
            self.section_end = self.end

        return new_type

    def _get_code_for(self, addr: int) -> List[DisasmLiteInst]:
        """Start disassembling at the given address."""
        # If we are reading a code block beyond the first, see if we already
        # have disassembled instructions beginning at the specified address.
        # For a CODE/ADDR/CODE function, we might get lucky and produce the
        # correct instruction after the jump table's junk instructions.
        for track in self.code_tracks:
            for i, inst in enumerate(track):
                if inst.address == addr:
                    return track[i:]

        # If we are here, we don't have the instructions.
        # Todo: Could try to be clever here and disassemble only
        # as much as we probably need (i.e. if a jump table is between CODE
        # blocks, there are probably only a few bad instructions after the
        # jump table is finished. We could disassemble up to the next verified
        # code address and stitch it together)

        blob_cropped = self.blob[addr - self.start :]
        instructions = [
            DisasmLiteInst(*inst)
            for inst in stop_at_int3(disassembler.disasm_lite(blob_cropped, addr))
        ]
        self.code_tracks.append(instructions)
        return instructions

    def _handle_jump(self, inst: DisasmLiteInst):
        # If this is a regular jump and its destination is within the
        # bounds of the binary data (i.e. the presumed function size),
        # add it to our list of confirmed addresses.
        if inst.op_str[0] == "0":
            value = int(inst.op_str, 16)
            self._insert_confirmed_addr(value, SectionType.CODE)

        # If this is jumping into a table of addresses, save the destination
        elif (match := displacement_regex.match(inst.op_str)) is not None:
            value = int(match.group(1), 16)
            self._insert_confirmed_addr(value, SectionType.ADDR_TAB)

    def analysis(self):
        self.cur_addr = self.start

        while (sect_type := self._next_section(self.cur_addr)) is not None:
            self.section_start = self.cur_addr

            if sect_type == SectionType.CODE:
                instructions = self._get_code_for(self.cur_addr)

                # If we didn't get any instructions back, something is wrong.
                # i.e. We can only read part of the full instruction that is up next.
                if len(instructions) == 0:
                    # Nudge the current addr so we will eventually move on to the
                    # next section.
                    # Todo: Maybe we could just call it quits here
                    self.cur_addr += 1
                    break

                for inst in instructions:
                    # section_end is updated as we read instructions.
                    # If we are into a jump/data table and would read
                    # a junk instruction, stop here.
                    if self.cur_addr >= self.section_end:
                        break

                    # print(f"{inst.address:x} : {inst.mnemonic} {inst.op_str}")

                    if inst.mnemonic in JUMP_MNEMONICS:
                        self._handle_jump(inst)
                    # Todo: log calls too (unwind section)
                    elif inst.mnemonic == "mov":
                        # Todo: maintain pairing of data/jump tables
                        if (match := displacement_regex.match(inst.op_str)) is not None:
                            value = int(match.group(1), 16)
                            self._insert_confirmed_addr(value, SectionType.DATA_TAB)

                    # Advance by the instruction size instead of copying the next
                    # instruction's address. With only one instruction, copying
                    # the address would leave us stuck here.
                    self.cur_addr += inst.size

                # End of the for loop over instructions.
                # We are at the end of the section or the entire function.
                # Cut out only the valid instructions for this section
                # and save them for later.

                # Todo: don't need to iter on every instruction here.
                # They are already in order.
                instruction_slice = [
                    inst for inst in instructions if inst.address < self.section_end
                ]
                self._finish_section(SectionType.CODE, instruction_slice)

            elif sect_type == SectionType.ADDR_TAB:
                # Clamp to a multiple of 4 (dwords)
                read_size = ((self.section_end - self.cur_addr) // 4) * 4
                offsets = range(self.section_start, self.section_start + read_size, 4)
                dwords = self.blob[
                    self.cur_addr - self.start : self.cur_addr - self.start + read_size
                ]
                addrs = [addr for addr, in struct.iter_unpack("<L", dwords)]
                for addr in addrs:
                    # Todo: the fact that these are jump table destinations
                    # should factor into the label name.
                    self._insert_confirmed_addr(addr, SectionType.CODE)

                jump_table = list(zip(offsets, addrs))
                # for (t0, t1) in jump_table:
                #     print(f"{t0:x} : --> {t1:x}")

                self._finish_section(SectionType.ADDR_TAB, jump_table)
                self.cur_addr = self.section_end

            else:
                # Todo: variable data size?
                read_size = self.section_end - self.cur_addr
                offsets = range(self.section_start, self.section_start + read_size)
                bytes_ = self.blob[
                    self.cur_addr - self.start : self.cur_addr - self.start + read_size
                ]
                data = [b for b, in struct.iter_unpack("<B", bytes_)]

                data_table = list(zip(offsets, data))
                # for (t0, t1) in data_table:
                #     print(f"{t0:x} : value {t1:02x}")

                self._finish_section(SectionType.DATA_TAB, data_table)
                self.cur_addr = self.section_end
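

# Illustrative usage sketch (hypothetical two-instruction blob at an arbitrary
# base address): the entire blob is classified as a single CODE section.
_ig = InstructGen(bytes([0x31, 0xC0, 0xC3]), 0x10001000)  # xor eax, eax; ret
assert _ig.sections[0].type == SectionType.CODE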
@ -1,243 +0,0 @@
"""Converts x86 machine code into text (i.e. assembly). The end goal is to
compare the code in the original and recomp binaries, using longest common
subsequence (LCS), i.e. difflib.SequenceMatcher.
The capstone library takes the raw bytes and gives us the mnemonic
and operand(s) for each instruction. We need to "sanitize" the text further
so that virtual addresses are replaced by a symbol name or a generic
placeholder string."""

import re
import struct
from functools import cache
from typing import Callable, List, Optional, Tuple
from collections import namedtuple
from .const import JUMP_MNEMONICS, SINGLE_OPERAND_INSTS
from .instgen import InstructGen, SectionType

ptr_replace_regex = re.compile(r"\[(0x[0-9a-f]+)\]")

displace_replace_regex = re.compile(r"\+ (0x[0-9a-f]+)\]")

# For matching an immediate value on its own.
# Preceded by start-of-string (first operand) or comma-space (second operand)
immediate_replace_regex = re.compile(r"(?:^|, )(0x[0-9a-f]+)")

DisasmLiteInst = namedtuple("DisasmLiteInst", "address, size, mnemonic, op_str")


@cache
def from_hex(string: str) -> Optional[int]:
    try:
        return int(string, 16)
    except ValueError:
        pass

    return None


def bytes_to_dword(b: bytes) -> Optional[int]:
    if len(b) == 4:
        return struct.unpack("<L", b)[0]

    return None


class ParseAsm:
    def __init__(
        self,
        relocate_lookup: Optional[Callable[[int], bool]] = None,
        name_lookup: Optional[Callable[[int, bool], str]] = None,
        bin_lookup: Optional[Callable[[int, int], Optional[bytes]]] = None,
    ) -> None:
        self.relocate_lookup = relocate_lookup
        self.name_lookup = name_lookup
        self.bin_lookup = bin_lookup
        self.replacements = {}
        self.number_placeholders = True

    def reset(self):
        self.replacements = {}

    def is_relocated(self, addr: int) -> bool:
        if callable(self.relocate_lookup):
            return self.relocate_lookup(addr)

        return False

    def lookup(
        self, addr: int, use_cache: bool = True, exact: bool = False
    ) -> Optional[str]:
        """Return a replacement name for this address if we find one."""
        if use_cache and (cached := self.replacements.get(addr, None)) is not None:
            return cached

        if callable(self.name_lookup):
            if (name := self.name_lookup(addr, exact)) is not None:
                if use_cache:
                    self.replacements[addr] = name

                return name

        return None

    def replace(self, addr: int) -> str:
        """Same function as lookup above, but here we return a placeholder
        if there is no better name to use."""
        if (name := self.lookup(addr)) is not None:
            return name

        # The placeholder number corresponds to the number of addresses we have
        # already replaced. This is so the number will be consistent across the diff
        # if we can replace some symbols with actual names in recomp but not orig.
        idx = len(self.replacements) + 1
        placeholder = f"<OFFSET{idx}>" if self.number_placeholders else "<OFFSET>"
        self.replacements[addr] = placeholder
        return placeholder

    def hex_replace_always(self, match: re.Match) -> str:
        """If a pointer value was matched, always insert a placeholder"""
        value = int(match.group(1), 16)
        return match.group(0).replace(match.group(1), self.replace(value))

    def hex_replace_relocated(self, match: re.Match) -> str:
        """For replacing immediate value operands. We only want to
        use the placeholder if we are certain that this is a valid address.
        We can check the relocation table to find out."""
        value = int(match.group(1), 16)
        if self.is_relocated(value):
            return match.group(0).replace(match.group(1), self.replace(value))

        return match.group(0)

    def hex_replace_annotated(self, match: re.Match) -> str:
        """For replacing immediate value operands. Here we replace the value
        only if the name lookup returns something. Do not use a placeholder."""
        value = int(match.group(1), 16)
        placeholder = self.lookup(value, use_cache=False)
        if placeholder is not None:
            return match.group(0).replace(match.group(1), placeholder)

        return match.group(0)

    def hex_replace_indirect(self, match: re.Match) -> str:
        """Edge case for hex_replace_always. The context of the instruction
        tells us that the pointer value is an absolute indirect.
        So we go to that location in the binary to get the address.
        If we cannot identify the indirect address, fall back to a lookup
        on the original pointer value so we might display something useful."""
        value = int(match.group(1), 16)
        indirect_value = None

        if callable(self.bin_lookup):
            indirect_value = self.bin_lookup(value, 4)

        if indirect_value is not None:
            indirect_addr = bytes_to_dword(indirect_value)
            if (
                indirect_addr is not None
                and self.lookup(indirect_addr, use_cache=False) is not None
            ):
                return match.group(0).replace(
                    match.group(1), "->" + self.replace(indirect_addr)
                )

        return match.group(0).replace(match.group(1), self.replace(value))

    def sanitize(self, inst: DisasmLiteInst) -> Tuple[str, str]:
        # For jumps or calls, if the entire op_str is a hex number, the value
        # is a relative offset.
        # Otherwise (i.e. it looks like `dword ptr [address]`) it is an
        # absolute indirect that we will handle below.
        # Providing the starting address of the function to capstone.disasm has
        # automatically resolved relative offsets to an absolute address.
        # We will have to undo this for some of the jumps or they will not match.

        if (
            inst.mnemonic in SINGLE_OPERAND_INSTS
            and (op_str_address := from_hex(inst.op_str)) is not None
        ):
            if inst.mnemonic == "call":
                return (inst.mnemonic, self.replace(op_str_address))

            if inst.mnemonic == "push":
                if self.is_relocated(op_str_address):
                    return (inst.mnemonic, self.replace(op_str_address))

                # To avoid falling into jump handling
                return (inst.mnemonic, inst.op_str)

            if inst.mnemonic == "jmp":
                # The unwind section contains JMPs to other functions.
                # If we have a name for this address, use it. If not,
                # do not create a new placeholder. We will instead
                # fall through to generic jump handling below.
                potential_name = self.lookup(op_str_address, exact=True)
                if potential_name is not None:
                    return (inst.mnemonic, potential_name)

            # Else: this is any jump.
            # Show the jump offset rather than the absolute address.
            jump_displacement = op_str_address - (inst.address + inst.size)
            return (inst.mnemonic, hex(jump_displacement))

        if inst.mnemonic == "call":
            # Special handling for absolute indirect CALL.
            op_str = ptr_replace_regex.sub(self.hex_replace_indirect, inst.op_str)
        else:
            op_str = ptr_replace_regex.sub(self.hex_replace_always, inst.op_str)

        # We only want relocated addresses for pointer displacement,
        # i.e. ptr [register + something].
        # Otherwise we would use a placeholder for every stack variable,
        # vtable call, or this->member access.
        op_str = displace_replace_regex.sub(self.hex_replace_relocated, op_str)

        # In the event of pointer comparison, only replace the immediate value
        # if it is a known address.
        if inst.mnemonic == "cmp":
            op_str = immediate_replace_regex.sub(self.hex_replace_annotated, op_str)
        else:
            op_str = immediate_replace_regex.sub(self.hex_replace_relocated, op_str)

        return (inst.mnemonic, op_str)
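
    # Usage sketch for sanitize (hypothetical addresses and name lookup):
    #
    #   p = ParseAsm(name_lookup=lambda addr, exact: "MyFunc" if addr == 0x10001000 else None)
    #   inst = DisasmLiteInst(0x10002000, 5, "call", "0x10001000")
    #   p.sanitize(inst)  # -> ("call", "MyFunc")
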
    def parse_asm(self, data: bytes, start_addr: Optional[int] = 0) -> List[Tuple[str, str]]:
        asm = []

        ig = InstructGen(data, start_addr)

        for sect_type, sect_contents in ig.sections:
            if sect_type == SectionType.CODE:
                for inst in sect_contents:
                    # Use heuristics to disregard some differences that aren't
                    # representative of the accuracy of a function (e.g. global offsets)

                    # If there is no pointer or immediate value in the op_str,
                    # there is nothing to sanitize.
                    # This leaves us with cases where a small immediate value or
                    # small displacement (this.member or vtable calls) appears.
                    # If we assume that instructions we want to sanitize need to be 5
                    # bytes -- 1 for the opcode and 4 for the address -- we can exclude
                    # cases where the hex value could not be an address.
                    # The exception is jumps, which can be as small as 2 bytes
                    # but are still useful to sanitize.
                    if "0x" in inst.op_str and (
                        inst.mnemonic in JUMP_MNEMONICS or inst.size > 4
                    ):
                        result = self.sanitize(inst)
                    else:
                        result = (inst.mnemonic, inst.op_str)

                    # mnemonic + " " + op_str
                    asm.append((hex(inst.address), " ".join(result)))
            elif sect_type == SectionType.ADDR_TAB:
                asm.append(("", "Jump table:"))
                for i, (ofs, _) in enumerate(sect_contents):
                    asm.append((hex(ofs), f"Jump_dest_{i}"))

            elif sect_type == SectionType.DATA_TAB:
                asm.append(("", "Data table:"))
                for ofs, b in sect_contents:
                    asm.append((hex(ofs), hex(b)))

        return asm
@ -1,80 +0,0 @@
import re

REGISTER_LIST = set(
    [
        "ax",
        "bp",
        "bx",
        "cx",
        "di",
        "dx",
        "eax",
        "ebp",
        "ebx",
        "ecx",
        "edi",
        "edx",
        "esi",
        "esp",
        "si",
        "sp",
    ]
)
WORDS = re.compile(r"\w+")


def get_registers(line: str):
    to_replace = []
    # use words regex to find all matching positions:
    for match in WORDS.finditer(line):
        reg = match.group(0)
        if reg in REGISTER_LIST:
            to_replace.append((reg, match.start()))
    return to_replace


def replace_register(
    lines: list[str], start_line: int, reg: str, replacement: str
) -> list[str]:
    return [
        line.replace(reg, replacement) if i >= start_line else line
        for i, line in enumerate(lines)
    ]


# Is it possible to make new_asm the same as original_asm by swapping registers?
def can_resolve_register_differences(original_asm, new_asm):
    # Split the ASM on spaces to get more granularity, and so
    # that we don't modify the original arrays passed in.
    original_asm = [part for line in original_asm for part in line.split()]
    new_asm = [part for line in new_asm for part in line.split()]

    # Swapping ain't gonna help if the lengths are different
    if len(original_asm) != len(new_asm):
        return False

    # Look for the mismatching lines
    for i, original_line in enumerate(original_asm):
        new_line = new_asm[i]
        if new_line != original_line:
            # Find all the registers to replace
            to_replace = get_registers(original_line)

            for replace in to_replace:
                (reg, reg_index) = replace
                replacing_reg = new_line[reg_index : reg_index + len(reg)]
                if replacing_reg in REGISTER_LIST:
                    if replacing_reg != reg:
                        # Do a three-way swap replacing in all the subsequent lines
                        temp_reg = "&" * len(reg)
                        new_asm = replace_register(new_asm, i, replacing_reg, temp_reg)
                        new_asm = replace_register(new_asm, i, reg, replacing_reg)
                        new_asm = replace_register(new_asm, i, temp_reg, reg)
                else:
                    # No replacement to do, different code, bail out
                    return False

    # Check if the lines are now the same
    for i, original_line in enumerate(original_asm):
        if new_asm[i] != original_line:
            return False
    return True
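

# Illustrative sketch (hypothetical asm text): eax in the original maps
# consistently onto ebx in the new code, so the difference is resolvable.
assert can_resolve_register_differences(
    ["mov eax, ecx", "add eax, 4"], ["mov ebx, ecx", "add ebx, 4"]
)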
@ -1,921 +0,0 @@
import os
import logging
import difflib
import struct
import uuid
from dataclasses import dataclass
from typing import Any, Callable, Iterable, List, Optional
from isledecomp.bin import Bin as IsleBin, InvalidVirtualAddressError
from isledecomp.cvdump.demangler import demangle_string_const
from isledecomp.cvdump import Cvdump, CvdumpAnalysis
from isledecomp.cvdump.types import scalar_type_pointer
from isledecomp.parser import DecompCodebase
from isledecomp.dir import walk_source_dir
from isledecomp.types import SymbolType
from isledecomp.compare.asm import ParseAsm
from isledecomp.compare.asm.fixes import assert_fixup, find_effective_match
from .db import CompareDb, MatchInfo
from .diff import combined_diff, CombinedDiffOutput
from .lines import LinesDb


logger = logging.getLogger(__name__)


@dataclass
class DiffReport:
    # pylint: disable=too-many-instance-attributes
    match_type: SymbolType
    orig_addr: int
    recomp_addr: int
    name: str
    udiff: Optional[CombinedDiffOutput] = None
    ratio: float = 0.0
    is_effective_match: bool = False
    is_stub: bool = False

    @property
    def effective_ratio(self) -> float:
        return 1.0 if self.is_effective_match else self.ratio

    def __str__(self) -> str:
        """For debug purposes. Proper diff printing (with coloring) is in another module."""
        return f"{self.name} (0x{self.orig_addr:x}) {self.ratio*100:.02f}%{'*' if self.is_effective_match else ''}"


def create_reloc_lookup(bin_file: IsleBin) -> Callable[[int], bool]:
    """Function generator for relocation table lookup"""

    def lookup(addr: int) -> bool:
        return addr > bin_file.imagebase and bin_file.is_relocated_addr(addr)

    return lookup


def create_bin_lookup(bin_file: IsleBin) -> Callable[[int, int], Optional[bytes]]:
    """Function generator for reading from the bin file"""

    def lookup(addr: int, size: int) -> Optional[bytes]:
        try:
            return bin_file.read(addr, size)
        except InvalidVirtualAddressError:
            return None

    return lookup


class Compare:
    # pylint: disable=too-many-instance-attributes
    def __init__(
        self, orig_bin: IsleBin, recomp_bin: IsleBin, pdb_file: str, code_dir: str
    ):
        self.orig_bin = orig_bin
        self.recomp_bin = recomp_bin
        self.pdb_file = pdb_file
        self.code_dir = code_dir
        # Controls whether we dump the asm output to a file
        self.debug: bool = False
        self.runid: str = uuid.uuid4().hex[:8]

        self._lines_db = LinesDb(code_dir)
        self._db = CompareDb()

        self._load_cvdump()
        self._load_markers()
        # Detect floats first to eliminate potential overlap with string data
        self._find_float_const()
        self._find_original_strings()
        self._match_imports()
        self._match_exports()
        self._match_thunks()
        self._find_vtordisp()

    def _load_cvdump(self):
        logger.info("Parsing %s ...", self.pdb_file)
        self.cv = (
            Cvdump(self.pdb_file)
            .lines()
            .globals()
            .publics()
            .symbols()
            .section_contributions()
            .types()
            .run()
        )
        self.cvdump_analysis = CvdumpAnalysis(self.cv)

        for sym in self.cvdump_analysis.nodes:
            # Skip nodes where we have almost no information.
            # These probably came from SECTION CONTRIBUTIONS.
            if sym.name() is None and sym.node_type is None:
                continue

            # The PDB might contain sections that do not line up with the
            # actual binary. The symbol "__except_list" is one example.
            # In these cases, just skip this symbol and move on because
            # we can't do much with it.
            if not self.recomp_bin.is_valid_section(sym.section):
                continue

            addr = self.recomp_bin.get_abs_addr(sym.section, sym.offset)
            sym.addr = addr

            # If this symbol is the final one in its section, we were not able to
            # estimate its size because we didn't have the total size of that section.
            # We can get this estimate now and assume that the final symbol occupies
            # the remainder of the section.
            if sym.estimated_size is None:
                sym.estimated_size = (
                    self.recomp_bin.get_section_extent_by_index(sym.section)
                    - sym.offset
                )

            if sym.node_type == SymbolType.STRING:
                string_info = demangle_string_const(sym.decorated_name)
                if string_info is None:
                    logger.debug(
                        "Could not demangle string symbol: %s", sym.decorated_name
                    )
                    continue

                # TODO: skip unicode for now. will need to handle these differently.
                if string_info.is_utf16:
                    continue

                raw = self.recomp_bin.read(addr, sym.size())
                try:
                    # We use the string length reported in the mangled symbol as the
                    # data size, but this is not always accurate with respect to the
                    # null terminator.
                    # e.g. ??_C@_0BA@EFDM@MxObjectFactory?$AA@
                    # reported length: 16 (includes null terminator)
                    # c.f. ??_C@_03DPKJ@enz?$AA@
                    # reported length: 3 (does NOT include terminator)
                    # This will handle the case where the entire string contains "\x00"
                    # because those are distinct from the empty string of length 0.
                    decoded_string = raw.decode("latin1")
                    rstrip_string = decoded_string.rstrip("\x00")

                    if decoded_string != "" and rstrip_string != "":
                        sym.friendly_name = rstrip_string
                    else:
                        sym.friendly_name = decoded_string

                except UnicodeDecodeError:
                    pass

            self._db.set_recomp_symbol(
                addr, sym.node_type, sym.name(), sym.decorated_name, sym.size()
            )

        for (section, offset), (
            filename,
            line_no,
        ) in self.cvdump_analysis.verified_lines.items():
            addr = self.recomp_bin.get_abs_addr(section, offset)
            self._lines_db.add_line(filename, line_no, addr)

        # The _entry symbol is referenced in the PE header so we get this match for free.
        self._db.set_function_pair(self.orig_bin.entry, self.recomp_bin.entry)

    def _load_markers(self):
        # Assume module name is the base filename of the original binary.
        (module, _) = os.path.splitext(os.path.basename(self.orig_bin.filename))

        codefiles = list(walk_source_dir(self.code_dir))
        codebase = DecompCodebase(codefiles, module.upper())

        def orig_bin_checker(addr: int) -> bool:
            return self.orig_bin.is_valid_vaddr(addr)

        # If the address of any annotation would cause an exception,
        # remove it and report an error.
        bad_annotations = codebase.prune_invalid_addrs(orig_bin_checker)

        for sym in bad_annotations:
            logger.error(
                "Invalid address 0x%x on %s annotation in file: %s",
                sym.offset,
                sym.type.name,
                sym.filename,
            )

        # Match lineref functions first because this is a guaranteed match.
        # If we have two functions that share the same name, and one is
        # a lineref, we can match the nameref correctly because the lineref
        # was already removed from consideration.
        for fun in codebase.iter_line_functions():
            recomp_addr = self._lines_db.search_line(fun.filename, fun.line_number)
            if recomp_addr is not None:
                self._db.set_function_pair(fun.offset, recomp_addr)
                if fun.should_skip():
                    self._db.mark_stub(fun.offset)

        for fun in codebase.iter_name_functions():
            self._db.match_function(fun.offset, fun.name)
            if fun.should_skip():
                self._db.mark_stub(fun.offset)

        for var in codebase.iter_variables():
            if var.is_static and var.parent_function is not None:
                self._db.match_static_variable(
                    var.offset, var.name, var.parent_function
                )
            else:
                if self._db.match_variable(var.offset, var.name):
                    self._check_if_array_and_match_elements(var.offset, var.name)

        for tbl in codebase.iter_vtables():
            self._db.match_vtable(tbl.offset, tbl.name, tbl.base_class)

        for string in codebase.iter_strings():
            # Not that we don't trust you, but we're checking the string
            # annotation to make sure it is accurate.
            try:
                # TODO: would presumably fail for wchar_t strings
                orig = self.orig_bin.read_string(string.offset).decode("latin1")
                string_correct = string.name == orig
            except UnicodeDecodeError:
                string_correct = False

            if not string_correct:
                logger.error(
                    "Data at 0x%x does not match string %s",
                    string.offset,
                    repr(string.name),
                )
                continue

            self._db.match_string(string.offset, string.name)

    def _check_if_array_and_match_elements(self, orig_addr: int, name: str):
        """
        Checks whether the global variable at `orig_addr` is an array.
        If it is, adds a match for all its elements. If it is an array of structs,
        all fields in that struct are also matched.
        Note that there is no recursion, so an array of arrays would not be handled entirely.
        This step is necessary e.g. for `0x100f0a20` (LegoRacers.cpp).
        """

        def _add_match_in_array(
            name: str, type_id: str, orig_addr: int, recomp_addr: int
        ):
            self._db.set_recomp_symbol(
                recomp_addr,
                SymbolType.POINTER if scalar_type_pointer(type_id) else SymbolType.DATA,
                name,
                name,
                # We only need the matches when they are referenced elsewhere, hence we don't need the size
                size=None,
            )
            self._db.set_pair(orig_addr, recomp_addr)

        matchinfo = self._db.get_by_orig(orig_addr)
        if matchinfo is None or matchinfo.recomp_addr is None:
            return
        recomp_addr = matchinfo.recomp_addr

        node = next(
            (x for x in self.cvdump_analysis.nodes if x.addr == recomp_addr),
            None,
        )
        if node is None or node.data_type is None:
            return

        if not node.data_type.key.startswith("0x"):
            # scalar type, so clearly not an array
            return

        data_type = self.cv.types.keys[node.data_type.key.lower()]

        if data_type["type"] == "LF_ARRAY":
            array_element_type = self.cv.types.get(data_type["array_type"])

            assert node.data_type.members is not None

            for array_element in node.data_type.members:
                orig_element_base_addr = orig_addr + array_element.offset
                recomp_element_base_addr = recomp_addr + array_element.offset
                if array_element_type.members is None:
                    _add_match_in_array(
                        f"{name}{array_element.name}",
                        array_element_type.key,
                        orig_element_base_addr,
                        recomp_element_base_addr,
                    )
                else:
                    for member in array_element_type.members:
                        _add_match_in_array(
                            f"{name}{array_element.name}.{member.name}",
                            array_element_type.key,
                            orig_element_base_addr + member.offset,
                            recomp_element_base_addr + member.offset,
                        )

    def _find_original_strings(self):
        """Go to the original binary and look for the specified string constants
        to find a match. This is a (relatively) expensive operation, so we only
        look at strings that we have not already matched via a STRING annotation."""
        # Release builds give each de-duped string a symbol, so they are easy to find and match.
        for string in self._db.get_unmatched_strings():
            addr = self.orig_bin.find_string(string.encode("latin1"))
            if addr is None:
                escaped = repr(string)
                logger.debug("Failed to find this string in the original: %s", escaped)
                continue

            self._db.match_string(addr, string)

        def is_real_string(s: str) -> bool:
            """Heuristic to ignore values that only look like strings.
            This is mostly about short strings (len <= 4) that could be byte or word values.
            """
            # 0x10 is the MSB of the address space for DLLs (LEGO1), so this is probably a pointer
            if len(s) == 0 or "\x10" in s:
                return False

            # assert(0) is common
            if len(s) == 1 and s[0] != "0":
                return False

            # Hack because str.isprintable() will fail on strings with newlines or tabs
            if len(s) <= 4 and "\\x" in repr(s):
                return False

            return True

        # Debug builds do not de-dupe the strings, so we need to find them via brute force scan.
        # We could try to match the string addrs if there is only one in orig and recomp.
        # When we sanitize the asm, the result is the same regardless.
        if self.orig_bin.is_debug:
            for addr, string in self.orig_bin.iter_string("latin1"):
                if is_real_string(string):
                    self._db.set_orig_symbol(
                        addr, SymbolType.STRING, string, len(string)
                    )

            for addr, string in self.recomp_bin.iter_string("latin1"):
                if is_real_string(string):
                    self._db.set_recomp_symbol(
                        addr, SymbolType.STRING, string, None, len(string)
                    )

    def _find_float_const(self):
        """Add floating point constants in each binary to the database.
        We are not matching anything right now because these values are not
        deduped like strings."""
        for addr, size, float_value in self.orig_bin.find_float_consts():
            self._db.set_orig_symbol(addr, SymbolType.FLOAT, str(float_value), size)

        for addr, size, float_value in self.recomp_bin.find_float_consts():
            self._db.set_recomp_symbol(
                addr, SymbolType.FLOAT, str(float_value), None, size
            )

def _match_imports(self):
|
||||
"""We can match imported functions based on the DLL name and
|
||||
function symbol name."""
|
||||
orig_byaddr = {
|
||||
addr: (dll.upper(), name) for (dll, name, addr) in self.orig_bin.imports
|
||||
}
|
||||
recomp_byname = {
|
||||
(dll.upper(), name): addr for (dll, name, addr) in self.recomp_bin.imports
|
||||
}
|
||||
# Combine these two dictionaries. We don't care about imports from recomp
|
||||
# not found in orig because:
|
||||
# 1. They shouldn't be there
|
||||
# 2. They are already identified via cvdump
|
||||
orig_to_recomp = {
|
||||
addr: recomp_byname.get(pair, None) for addr, pair in orig_byaddr.items()
|
||||
}
|
||||
|
||||
# Now: we have the IAT offset in each matched up, so we need to make
|
||||
# the connection between the thunk functions.
|
||||
# We already have the symbol name we need from the PDB.
|
||||
for orig, recomp in orig_to_recomp.items():
|
||||
if orig is None or recomp is None:
|
||||
continue
|
||||
|
||||
# Match the __imp__ symbol
|
||||
self._db.set_pair(orig, recomp, SymbolType.POINTER)
|
||||
|
||||
# Read the relative address from .idata
|
||||
try:
|
||||
(recomp_rva,) = struct.unpack("<L", self.recomp_bin.read(recomp, 4))
|
||||
(orig_rva,) = struct.unpack("<L", self.orig_bin.read(orig, 4))
|
||||
except ValueError:
|
||||
# Bail out if there's a problem with struct.unpack
|
||||
continue
|
||||
|
||||
# Strictly speaking, this is a hack to support asm sanitize.
|
||||
# When calling an import, we will recognize that the address for the
|
||||
# CALL instruction is a pointer to the actual address, but this is
|
||||
# not only not the address of a function, it is not an address at all.
|
||||
# To make the asm display work correctly (i.e. to match what you see
|
||||
# in ghidra) create a function match on the RVA. This is not a valid
|
||||
# virtual address because it is before the imagebase, but it will
|
||||
# do what we need it to do in the sanitize function.
|
||||
|
||||
(dll_name, func_name) = orig_byaddr[orig]
|
||||
fullname = dll_name + ":" + func_name
|
||||
self._db.set_recomp_symbol(
|
||||
recomp_rva, SymbolType.FUNCTION, fullname, None, 4
|
||||
)
|
||||
self._db.set_pair(orig_rva, recomp_rva, SymbolType.FUNCTION)
|
||||
self._db.skip_compare(orig_rva)
|
||||
|
||||
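A small sketch of the dictionary combination above, using made-up import tuples in the same (dll, name, addr) shape; upper-casing the DLL name makes the lookup case-insensitive:

orig_imports = [("KERNEL32.dll", "CreateFileA", 0x1000), ("WINMM.dll", "timeGetTime", 0x1004)]
recomp_imports = [("kernel32.dll", "CreateFileA", 0x2000)]

orig_byaddr = {addr: (dll.upper(), name) for (dll, name, addr) in orig_imports}
recomp_byname = {(dll.upper(), name): addr for (dll, name, addr) in recomp_imports}
orig_to_recomp = {addr: recomp_byname.get(pair) for addr, pair in orig_byaddr.items()}
assert orig_to_recomp == {0x1000: 0x2000, 0x1004: None}  # None pairs are skipped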
    def _match_thunks(self):
        """Thunks are (by nature) matched by indirection. If a thunk from orig
        points at a function we have already matched, we can find the matching
        thunk in recomp because it points to the same place."""

        # Mark all recomp thunks first. This allows us to use their name
        # when we sanitize the asm.
        for recomp_thunk, recomp_addr in self.recomp_bin.thunks:
            recomp_func = self._db.get_by_recomp(recomp_addr)
            if recomp_func is None:
                continue

            self._db.create_recomp_thunk(recomp_thunk, recomp_func.name)

        # Thunks may be non-unique, so use a list as dict value when
        # inverting the list of tuples from self.recomp_bin.
        recomp_thunks = {}
        for thunk_addr, func_addr in self.recomp_bin.thunks:
            recomp_thunks.setdefault(func_addr, []).append(thunk_addr)

        # Now match the thunks from orig where we can.
        for orig_thunk, orig_addr in self.orig_bin.thunks:
            orig_func = self._db.get_by_orig(orig_addr)
            if orig_func is None:
                continue

            # Check whether the thunk destination is a matched symbol
            if orig_func.recomp_addr not in recomp_thunks:
                self._db.create_orig_thunk(orig_thunk, orig_func.name)
                continue

            # If there are multiple thunks, they are already in v.addr order.
            # Pop the earliest one and match it.
            recomp_thunk = recomp_thunks[orig_func.recomp_addr].pop(0)
            if len(recomp_thunks[orig_func.recomp_addr]) == 0:
                del recomp_thunks[orig_func.recomp_addr]

            self._db.set_function_pair(orig_thunk, recomp_thunk)

            # Don't compare thunk functions for now. The comparison isn't
            # "useful" in the usual sense. We are only looking at the
            # bytes of the jmp instruction and not the larger context of
            # where this function is. Also: these will always match 100%
            # because we are searching for a match to register this as a
            # function in the first place.
            self._db.skip_compare(orig_thunk)
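Sketch of the one-to-many inversion described in the comment above, with illustrative addresses:

thunks = [(0x10, 0x100), (0x14, 0x100), (0x18, 0x200)]  # (thunk_addr, func_addr)
recomp_thunks = {}
for thunk_addr, func_addr in thunks:
    recomp_thunks.setdefault(func_addr, []).append(thunk_addr)
assert recomp_thunks == {0x100: [0x10, 0x14], 0x200: [0x18]}
# pop(0) then consumes the earliest thunk each time a function matches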
    def _match_exports(self):
        # invert for name lookup
        orig_exports = {y: x for (x, y) in self.orig_bin.exports}

        for recomp_addr, export_name in self.recomp_bin.exports:
            orig_addr = orig_exports.get(export_name)
            if orig_addr is None:
                continue

            try:
                # Check whether either of the addresses is actually a thunk.
                # This is a quirk of the debug builds. Technically the export
                # *is* the thunk, but it's more helpful to mark the actual function.
                # It could be the case that only one side is a thunk, but we can
                # deal with that.
                (opcode, rel_addr) = struct.unpack(
                    "<Bl", self.recomp_bin.read(recomp_addr, 5)
                )
                if opcode == 0xE9:
                    recomp_addr += 5 + rel_addr

                (opcode, rel_addr) = struct.unpack(
                    "<Bl", self.orig_bin.read(orig_addr, 5)
                )
                if opcode == 0xE9:
                    orig_addr += 5 + rel_addr
            except ValueError:
                # Bail out if there's a problem with struct.unpack
                continue

            if self._db.set_pair_tentative(orig_addr, recomp_addr):
                logger.debug("Matched export %s", repr(export_name))
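The 0xE9 check above follows a relative JMP to the real function. A self-contained sketch of the displacement math:

import struct

code = b"\xe9" + struct.pack("<l", 0x20)  # jmp +0x20, 5 bytes total
addr = 0x1000
(opcode, rel_addr) = struct.unpack("<Bl", code)
if opcode == 0xE9:
    addr += 5 + rel_addr  # destination = end of instruction + displacement
assert addr == 0x1025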
    def _find_vtordisp(self):
        """If there are any cases of virtual inheritance, we can read
        through the vtables for those classes and find the vtable thunk
        functions (vtordisp).

        Our approach is this: walk both vtables and check where we have a
        vtordisp in the recomp table. Inspect the function at that vtable
        position (in both) and check whether we jump to the same function.

        One potential pitfall here is that the virtual displacement could
        differ between the thunks. We are not (yet) checking for this, so the
        result is that the vtable will appear to match but we will have a diff
        on the thunk in our regular function comparison.

        We could do this differently and check only the original vtable,
        construct the name of the vtordisp function and match based on that."""

        for match in self._db.get_matches_by_type(SymbolType.VTABLE):
            assert (
                match.name is not None
                and match.orig_addr is not None
                and match.recomp_addr is not None
                and match.size is not None
            )
            # We need some method of identifying vtables that
            # might have thunks, and this ought to work okay.
            if "{for" not in match.name:
                continue

            next_orig = self._db.get_next_orig_addr(match.orig_addr)
            assert next_orig is not None
            orig_upper_size_limit = next_orig - match.orig_addr
            if orig_upper_size_limit < match.size:
                # This could happen in debug builds due to code changes between BETA10 and LEGO1,
                # but we have not seen it yet as of 2024-08-28.
                logger.warning(
                    "Recomp vtable is larger than orig vtable for %s",
                    match.name,
                )

            # TODO: We might want to fix this at the source (cvdump) instead.
            # Any problem will be logged later when we compare the vtable.
            vtable_size = 4 * (min(match.size, orig_upper_size_limit) // 4)
            orig_table = self.orig_bin.read(match.orig_addr, vtable_size)
            recomp_table = self.recomp_bin.read(match.recomp_addr, vtable_size)

            raw_addrs = zip(
                [t for (t,) in struct.iter_unpack("<L", orig_table)],
                [t for (t,) in struct.iter_unpack("<L", recomp_table)],
            )

            # Now walk both vtables looking for thunks.
            for orig_addr, recomp_addr in raw_addrs:
                if orig_addr == 0:
                    # This happens in debug builds due to code changes between BETA10 and LEGO1.
                    # Note that there is a risk of running into the next vtable if there is no gap in between,
                    # which we cannot protect against at the moment.
                    logger.warning(
                        "Recomp vtable is larger than orig vtable for %s", match.name
                    )
                    break

                if self._db.is_vtordisp(recomp_addr):
                    self._match_vtordisp_in_vtable(orig_addr, recomp_addr)
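Sketch of the parallel table walk above: both blobs decode as little-endian u32 entries and are consumed pairwise, with a zero on the orig side ending the scan early:

import struct

orig_table = struct.pack("<3L", 0x1000A000, 0x1000B000, 0)
recomp_table = struct.pack("<3L", 0x1001A000, 0x1001B000, 0x1001C000)
for orig_addr, recomp_addr in zip(
    (t for (t,) in struct.iter_unpack("<L", orig_table)),
    (t for (t,) in struct.iter_unpack("<L", recomp_table)),
):
    if orig_addr == 0:  # orig table ended early (debug-build size mismatch)
        break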
    def _match_vtordisp_in_vtable(self, orig_addr, recomp_addr):
        thunk_fn = self.get_by_recomp(recomp_addr)
        assert thunk_fn is not None
        assert thunk_fn.size is not None

        # Read the function bytes here.
        # In practice, the adjuster thunk will be under 16 bytes.
        # If we have thunks of unequal size, we can still tell whether they are thunking
        # the same function by grabbing the JMP instruction at the end.
        thunk_presumed_size = max(thunk_fn.size, 16)

        # Strip off MSVC padding 0xcc bytes.
        # This should be safe to do; it is highly unlikely that
        # the MSB of the jump displacement would be 0xcc. (huge jump)
        orig_thunk_bin = self.orig_bin.read(orig_addr, thunk_presumed_size).rstrip(
            b"\xcc"
        )

        recomp_thunk_bin = self.recomp_bin.read(
            recomp_addr, thunk_presumed_size
        ).rstrip(b"\xcc")

        # Read jump opcode and displacement (last 5 bytes)
        (orig_jmp, orig_disp) = struct.unpack("<Bi", orig_thunk_bin[-5:])
        (recomp_jmp, recomp_disp) = struct.unpack("<Bi", recomp_thunk_bin[-5:])

        # Make sure it's a JMP
        if orig_jmp != 0xE9 or recomp_jmp != 0xE9:
            logger.warning(
                "Not a jump in vtordisp at (0x%x, 0x%x)", orig_addr, recomp_addr
            )
            return

        # Calculate jump destination from the end of the JMP instruction
        # i.e. the end of the function
        orig_actual = orig_addr + len(orig_thunk_bin) + orig_disp
        recomp_actual = recomp_addr + len(recomp_thunk_bin) + recomp_disp

        # If they are thunking the same function, then this must be a match.
        if self.is_pointer_match(orig_actual, recomp_actual):
            if len(orig_thunk_bin) != len(recomp_thunk_bin):
                logger.warning(
                    "Adjuster thunk %s (0x%x) is not exact",
                    thunk_fn.name,
                    orig_addr,
                )
            self._db.set_function_pair(orig_addr, recomp_addr)
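Sketch of the 0xCC strip above: MSVC pads with int3 bytes, so rstrip leaves the JMP as the final 5 bytes of the buffer (bytes here are illustrative):

import struct

raw = b"\x8b\x44\x24\x04" + b"\xe9\x10\x00\x00\x00" + b"\xcc" * 7
thunk = raw.rstrip(b"\xcc")
(jmp, disp) = struct.unpack("<Bi", thunk[-5:])
assert (jmp, disp) == (0xE9, 0x10) and len(thunk) == 9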
    def _dump_asm(self, orig_combined, recomp_combined):
        """Append the provided assembly output to the debug files"""
        with open(f"orig-{self.runid}.txt", "a", encoding="utf-8") as f:
            for addr, line in orig_combined:
                f.write(f"{addr}: {line}\n")

        with open(f"recomp-{self.runid}.txt", "a", encoding="utf-8") as f:
            for addr, line in recomp_combined:
                f.write(f"{addr}: {line}\n")

    def _compare_function(self, match: MatchInfo) -> DiffReport:
        # Detect when the recomp function size would cause us to read
        # enough bytes from the original function that we cross into
        # the next annotated function.
        next_orig = self._db.get_next_orig_addr(match.orig_addr)
        if next_orig is not None:
            orig_size = min(next_orig - match.orig_addr, match.size)
        else:
            orig_size = match.size

        orig_raw = self.orig_bin.read(match.orig_addr, orig_size)
        recomp_raw = self.recomp_bin.read(match.recomp_addr, match.size)

        # It's unlikely that a function other than an adjuster thunk would
        # start with a SUB instruction, so alert to a possible wrong
        # annotation here.
        # There's probably a better place to do this, but we're reading
        # the function bytes here already.
        try:
            if orig_raw[0] == 0x2B and recomp_raw[0] != 0x2B:
                logger.warning(
                    "Possible thunk at 0x%x (%s)", match.orig_addr, match.name
                )
        except IndexError:
            pass

        def orig_lookup(addr: int, exact: bool) -> Optional[str]:
            m = self._db.get_by_orig(addr, exact)
            if m is None:
                return None

            if m.orig_addr == addr:
                return m.match_name()

            offset = addr - m.orig_addr
            if m.compare_type != SymbolType.DATA or offset >= m.size:
                return None

            return m.offset_name(offset)

        def recomp_lookup(addr: int, exact: bool) -> Optional[str]:
            m = self._db.get_by_recomp(addr, exact)
            if m is None:
                return None

            if m.recomp_addr == addr:
                return m.match_name()

            offset = addr - m.recomp_addr
            if m.compare_type != SymbolType.DATA or offset >= m.size:
                return None

            return m.offset_name(offset)

        orig_should_replace = create_reloc_lookup(self.orig_bin)
        recomp_should_replace = create_reloc_lookup(self.recomp_bin)

        orig_bin_lookup = create_bin_lookup(self.orig_bin)
        recomp_bin_lookup = create_bin_lookup(self.recomp_bin)

        orig_parse = ParseAsm(
            relocate_lookup=orig_should_replace,
            name_lookup=orig_lookup,
            bin_lookup=orig_bin_lookup,
        )
        recomp_parse = ParseAsm(
            relocate_lookup=recomp_should_replace,
            name_lookup=recomp_lookup,
            bin_lookup=recomp_bin_lookup,
        )

        orig_combined = orig_parse.parse_asm(orig_raw, match.orig_addr)
        recomp_combined = recomp_parse.parse_asm(recomp_raw, match.recomp_addr)

        if self.debug:
            self._dump_asm(orig_combined, recomp_combined)

        # Check for assert calls only if we expect to find them
        if self.orig_bin.is_debug or self.recomp_bin.is_debug:
            assert_fixup(orig_combined)
            assert_fixup(recomp_combined)

        # Detach addresses from asm lines for the text diff.
        orig_asm = [x[1] for x in orig_combined]
        recomp_asm = [x[1] for x in recomp_combined]

        diff = difflib.SequenceMatcher(None, orig_asm, recomp_asm, autojunk=False)
        ratio = diff.ratio()

        if ratio != 1.0:
            # Check whether we can resolve register swaps which are actually
            # perfect matches modulo compiler entropy.
            codes = diff.get_opcodes()
            is_effective_match = find_effective_match(codes, orig_asm, recomp_asm)
            unified_diff = combined_diff(
                diff, orig_combined, recomp_combined, context_size=10
            )
        else:
            is_effective_match = False
            unified_diff = []

        return DiffReport(
            match_type=SymbolType.FUNCTION,
            orig_addr=match.orig_addr,
            recomp_addr=match.recomp_addr,
            name=match.name,
            udiff=unified_diff,
            ratio=ratio,
            is_effective_match=is_effective_match,
        )
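Sketch of the ratio computation above on detached asm text; one mismatching line out of five gives 0.8:

import difflib

orig_asm = ["push ebp", "mov ebp, esp", "xor eax, eax", "pop ebp", "ret"]
recomp_asm = ["push ebp", "mov ebp, esp", "xor ecx, ecx", "pop ebp", "ret"]
diff = difflib.SequenceMatcher(None, orig_asm, recomp_asm, autojunk=False)
assert diff.ratio() == 0.8  # 2 * 4 matching lines / 10 total lines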
    def _compare_vtable(self, match: MatchInfo) -> DiffReport:
        vtable_size = match.size

        # The vtable size should always be a multiple of 4 because that
        # is the pointer size. If it is not (for whatever reason)
        # it would cause iter_unpack to blow up so let's just fix it.
        if vtable_size % 4 != 0:
            logger.warning(
                "Vtable for class %s has irregular size %d", match.name, vtable_size
            )
            vtable_size = 4 * (vtable_size // 4)

        orig_table = self.orig_bin.read(match.orig_addr, vtable_size)
        recomp_table = self.recomp_bin.read(match.recomp_addr, vtable_size)

        raw_addrs = zip(
            [t for (t,) in struct.iter_unpack("<L", orig_table)],
            [t for (t,) in struct.iter_unpack("<L", recomp_table)],
        )

        def match_text(m: Optional[MatchInfo], raw_addr: Optional[int] = None) -> str:
            """Format the function reference at this vtable index as text.
            If we have not identified this function, we have the option to
            display the raw address. This is only worth doing for the original addr
            because we should always be able to identify the recomp function.
            If the original function is missing then this probably means that the class
            should override the given function from the superclass, but we have not
            implemented this yet.
            """

            if m is not None:
                orig = hex(m.orig_addr) if m.orig_addr is not None else "no orig"
                recomp = (
                    hex(m.recomp_addr) if m.recomp_addr is not None else "no recomp"
                )
                return f"({orig} / {recomp}) : {m.name}"

            if raw_addr is not None:
                return f"0x{raw_addr:x} from orig not annotated."

            return "(no match)"

        orig_text = []
        recomp_text = []
        ratio = 0
        n_entries = 0

        # Now compare each pointer from the two vtables.
        for i, (raw_orig, raw_recomp) in enumerate(raw_addrs):
            orig = self._db.get_by_orig(raw_orig)
            recomp = self._db.get_by_recomp(raw_recomp)

            if (
                orig is not None
                and recomp is not None
                and orig.recomp_addr == recomp.recomp_addr
            ):
                ratio += 1

            n_entries += 1
            index = f"vtable0x{i*4:02x}"
            orig_text.append((index, match_text(orig, raw_orig)))
            recomp_text.append((index, match_text(recomp)))

        ratio = ratio / float(n_entries) if n_entries > 0 else 0

        # n=100: Show the entire table if there is a diff to display.
        # Otherwise it would be confusing if the table got cut off.

        sm = difflib.SequenceMatcher(
            None,
            [x[1] for x in orig_text],
            [x[1] for x in recomp_text],
        )

        unified_diff = combined_diff(sm, orig_text, recomp_text, context_size=100)

        return DiffReport(
            match_type=SymbolType.VTABLE,
            orig_addr=match.orig_addr,
            recomp_addr=match.recomp_addr,
            name=match.name,
            udiff=unified_diff,
            ratio=ratio,
        )
    def _compare_match(self, match: MatchInfo) -> Optional[DiffReport]:
        """Router for comparison type"""

        if match.size is None or match.size == 0:
            return None

        options = self._db.get_match_options(match.orig_addr)
        if options.get("skip", False):
            return None

        if options.get("stub", False):
            return DiffReport(
                match_type=match.compare_type,
                orig_addr=match.orig_addr,
                recomp_addr=match.recomp_addr,
                name=match.name,
                is_stub=True,
            )

        if match.compare_type == SymbolType.FUNCTION:
            return self._compare_function(match)

        if match.compare_type == SymbolType.VTABLE:
            return self._compare_vtable(match)

        return None

    ## Public API

    def is_pointer_match(self, orig_addr, recomp_addr) -> bool:
        """Check whether these pointers point at the same thing"""

        # Null pointers considered matching
        if orig_addr == 0 and recomp_addr == 0:
            return True

        match = self._db.get_by_orig(orig_addr)
        if match is None:
            return False

        return match.recomp_addr == recomp_addr

    def get_by_orig(self, addr: int) -> Optional[MatchInfo]:
        return self._db.get_by_orig(addr)

    def get_by_recomp(self, addr: int) -> Optional[MatchInfo]:
        return self._db.get_by_recomp(addr)

    def get_all(self) -> List[MatchInfo]:
        return self._db.get_all()

    def get_functions(self) -> List[MatchInfo]:
        return self._db.get_matches_by_type(SymbolType.FUNCTION)

    def get_vtables(self) -> List[MatchInfo]:
        return self._db.get_matches_by_type(SymbolType.VTABLE)

    def get_variables(self) -> List[MatchInfo]:
        return self._db.get_matches_by_type(SymbolType.DATA)

    def get_match_options(self, addr: int) -> Optional[dict[str, Any]]:
        return self._db.get_match_options(addr)

    def compare_address(self, addr: int) -> Optional[DiffReport]:
        match = self._db.get_one_match(addr)
        if match is None:
            return None

        return self._compare_match(match)

    def compare_all(self) -> Iterable[DiffReport]:
        for match in self._db.get_matches():
            diff = self._compare_match(match)
            if diff is not None:
                yield diff

    def compare_functions(self) -> Iterable[DiffReport]:
        for match in self.get_functions():
            diff = self._compare_match(match)
            if diff is not None:
                yield diff

    def compare_variables(self):
        pass

    def compare_pointers(self):
        pass

    def compare_strings(self):
        pass

    def compare_vtables(self) -> Iterable[DiffReport]:
        for match in self.get_vtables():
            diff = self._compare_match(match)
            if diff is not None:
                yield diff
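A hypothetical driver loop over the generator API above; 'isle_compare' is an illustrative instance name, and this assumes DiffReport defaults is_stub to False:

for report in isle_compare.compare_all():
    status = "stub" if report.is_stub else f"{report.ratio:.0%}"
    print(f"0x{report.orig_addr:x} {report.name}: {status}")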
@ -1,554 +0,0 @@
"""Wrapper for database (here an in-memory sqlite database) that collects the
addresses/symbols that we want to compare between the original and recompiled binaries."""

import sqlite3
import logging
from typing import Any, List, Optional
from isledecomp.types import SymbolType
from isledecomp.cvdump.demangler import get_vtordisp_name

_SETUP_SQL = """
    DROP TABLE IF EXISTS `symbols`;
    DROP TABLE IF EXISTS `match_options`;

    CREATE TABLE `symbols` (
        compare_type int,
        orig_addr int,
        recomp_addr int,
        name text,
        decorated_name text,
        size int
    );

    CREATE TABLE `match_options` (
        addr int not null,
        name text not null,
        value text,
        primary key (addr, name)
    ) without rowid;

    CREATE VIEW IF NOT EXISTS `match_info`
    (compare_type, orig_addr, recomp_addr, name, size) AS
        SELECT compare_type, orig_addr, recomp_addr, name, size
        FROM `symbols`
        ORDER BY orig_addr NULLS LAST;

    CREATE INDEX `symbols_or` ON `symbols` (orig_addr);
    CREATE INDEX `symbols_re` ON `symbols` (recomp_addr);
    CREATE INDEX `symbols_na` ON `symbols` (name);
"""


class MatchInfo:
    def __init__(
        self,
        ctype: Optional[int],
        orig: Optional[int],
        recomp: Optional[int],
        name: Optional[str],
        size: Optional[int],
    ) -> None:
        self.compare_type = SymbolType(ctype) if ctype is not None else None
        self.orig_addr = orig
        self.recomp_addr = recomp
        self.name = name
        self.size = size

    def match_name(self) -> Optional[str]:
        """Combination of the name and compare type.
        Intended for name substitution in the diff. If there is a diff,
        it will be more obvious what this symbol indicates."""
        if self.name is None:
            return None

        ctype = self.compare_type.name if self.compare_type is not None else "UNK"
        name = repr(self.name) if ctype == "STRING" else self.name
        return f"{name} ({ctype})"

    def offset_name(self, ofs: int) -> Optional[str]:
        if self.name is None:
            return None

        return f"{self.name}+{ofs} (OFFSET)"


def matchinfo_factory(_, row):
    return MatchInfo(*row)


logger = logging.getLogger(__name__)


class CompareDb:
    # pylint: disable=too-many-public-methods
    def __init__(self):
        self._db = sqlite3.connect(":memory:")
        self._db.executescript(_SETUP_SQL)

    def set_orig_symbol(
        self,
        addr: int,
        compare_type: Optional[SymbolType],
        name: Optional[str],
        size: Optional[int],
    ):
        # Ignore collisions here.
        if self._orig_used(addr):
            return

        compare_value = compare_type.value if compare_type is not None else None
        self._db.execute(
            "INSERT INTO `symbols` (orig_addr, compare_type, name, size) VALUES (?,?,?,?)",
            (addr, compare_value, name, size),
        )

    def set_recomp_symbol(
        self,
        addr: int,
        compare_type: Optional[SymbolType],
        name: Optional[str],
        decorated_name: Optional[str],
        size: Optional[int],
    ):
        # Ignore collisions here. The same recomp address can have
        # multiple names (e.g. _strlwr and __strlwr)
        if self._recomp_used(addr):
            return

        compare_value = compare_type.value if compare_type is not None else None
        self._db.execute(
            "INSERT INTO `symbols` (recomp_addr, compare_type, name, decorated_name, size) VALUES (?,?,?,?,?)",
            (addr, compare_value, name, decorated_name, size),
        )

    def get_unmatched_strings(self) -> List[str]:
        """Return any strings not already identified by STRING markers."""

        cur = self._db.execute(
            "SELECT name FROM `symbols` WHERE compare_type = ? AND orig_addr IS NULL",
            (SymbolType.STRING.value,),
        )

        return [string for (string,) in cur.fetchall()]

    def get_all(self) -> List[MatchInfo]:
        cur = self._db.execute("SELECT * FROM `match_info`")
        cur.row_factory = matchinfo_factory

        return cur.fetchall()

    def get_matches(self) -> List[MatchInfo]:
        cur = self._db.execute(
            """SELECT * FROM `match_info`
            WHERE orig_addr IS NOT NULL
            AND recomp_addr IS NOT NULL
            """,
        )
        cur.row_factory = matchinfo_factory

        return cur.fetchall()

    def get_one_match(self, addr: int) -> Optional[MatchInfo]:
        cur = self._db.execute(
            """SELECT * FROM `match_info`
            WHERE orig_addr = ?
            AND recomp_addr IS NOT NULL
            """,
            (addr,),
        )
        cur.row_factory = matchinfo_factory
        return cur.fetchone()

    def _get_closest_orig(self, addr: int) -> Optional[int]:
        value = self._db.execute(
            """SELECT max(orig_addr) FROM `symbols`
            WHERE ? >= orig_addr
            LIMIT 1
            """,
            (addr,),
        ).fetchone()
        return value[0] if value is not None else None

    def _get_closest_recomp(self, addr: int) -> Optional[int]:
        value = self._db.execute(
            """SELECT max(recomp_addr) FROM `symbols`
            WHERE ? >= recomp_addr
            LIMIT 1
            """,
            (addr,),
        ).fetchone()
        return value[0] if value is not None else None

    def get_by_orig(self, addr: int, exact: bool = True) -> Optional[MatchInfo]:
        if not exact and not self._orig_used(addr):
            addr = self._get_closest_orig(addr)
            if addr is None:
                return None

        cur = self._db.execute(
            """SELECT * FROM `match_info`
            WHERE orig_addr = ?
            """,
            (addr,),
        )
        cur.row_factory = matchinfo_factory
        return cur.fetchone()

    def get_by_recomp(self, addr: int, exact: bool = True) -> Optional[MatchInfo]:
        if not exact and not self._recomp_used(addr):
            addr = self._get_closest_recomp(addr)
            if addr is None:
                return None

        cur = self._db.execute(
            """SELECT * FROM `match_info`
            WHERE recomp_addr = ?
            """,
            (addr,),
        )
        cur.row_factory = matchinfo_factory
        return cur.fetchone()

    def get_matches_by_type(self, compare_type: SymbolType) -> List[MatchInfo]:
        cur = self._db.execute(
            """SELECT * FROM `match_info`
            WHERE compare_type = ?
            AND orig_addr IS NOT NULL
            AND recomp_addr IS NOT NULL
            """,
            (compare_type.value,),
        )
        cur.row_factory = matchinfo_factory

        return cur.fetchall()

    def _orig_used(self, addr: int) -> bool:
        cur = self._db.execute("SELECT 1 FROM symbols WHERE orig_addr = ?", (addr,))
        return cur.fetchone() is not None

    def _recomp_used(self, addr: int) -> bool:
        cur = self._db.execute("SELECT 1 FROM symbols WHERE recomp_addr = ?", (addr,))
        return cur.fetchone() is not None

    def set_pair(
        self, orig: int, recomp: int, compare_type: Optional[SymbolType] = None
    ) -> bool:
        if self._orig_used(orig):
            logger.debug("Original address %s not unique!", hex(orig))
            return False

        compare_value = compare_type.value if compare_type is not None else None
        cur = self._db.execute(
            "UPDATE `symbols` SET orig_addr = ?, compare_type = ? WHERE recomp_addr = ?",
            (orig, compare_value, recomp),
        )

        return cur.rowcount > 0

    def set_pair_tentative(
        self, orig: int, recomp: int, compare_type: Optional[SymbolType] = None
    ) -> bool:
        """Declare a match for the original and recomp addresses given, but only if:
        1. The original address is not used elsewhere (as with set_pair)
        2. The recomp address has not already been matched
        If the compare_type is given, update this also, but only if NULL in the db.

        The purpose here is to set matches found via some automated analysis
        but to not overwrite a match provided by the human operator."""
        if self._orig_used(orig):
            # Probable and expected situation. Just ignore it.
            return False

        compare_value = compare_type.value if compare_type is not None else None

        cur = self._db.execute(
            """UPDATE `symbols`
            SET orig_addr = ?, compare_type = coalesce(compare_type, ?)
            WHERE recomp_addr = ?
            AND orig_addr IS NULL""",
            (orig, compare_value, recomp),
        )

        return cur.rowcount > 0
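A minimal sqlite sketch of the coalesce() behavior in set_pair_tentative above: an existing non-NULL compare_type survives the tentative update.

import sqlite3

db = sqlite3.connect(":memory:")
db.execute("CREATE TABLE symbols (compare_type int, orig_addr int, recomp_addr int)")
db.execute("INSERT INTO symbols VALUES (2, NULL, 4096)")  # type already known
db.execute(
    "UPDATE symbols SET orig_addr = ?, compare_type = coalesce(compare_type, ?)"
    " WHERE recomp_addr = ? AND orig_addr IS NULL",
    (8192, 7, 4096),
)
assert db.execute("SELECT compare_type, orig_addr FROM symbols").fetchone() == (2, 8192)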
    def set_function_pair(self, orig: int, recomp: int) -> bool:
        """For lineref match or _entry"""
        return self.set_pair(orig, recomp, SymbolType.FUNCTION)

    def create_orig_thunk(self, addr: int, name: str) -> bool:
        """Create a thunk function reference using the orig address.
        We are here because we have a match on the thunked function,
        but it is not thunked in the recomp build."""

        if self._orig_used(addr):
            return False

        thunk_name = f"Thunk of '{name}'"

        # Assuming relative jump instruction for thunks (5 bytes)
        cur = self._db.execute(
            """INSERT INTO `symbols`
            (orig_addr, compare_type, name, size)
            VALUES (?,?,?,?)""",
            (addr, SymbolType.FUNCTION.value, thunk_name, 5),
        )

        return cur.rowcount > 0

    def create_recomp_thunk(self, addr: int, name: str) -> bool:
        """Create a thunk function reference using the recomp address.
        We start from the recomp side for this because we are guaranteed
        to have full information from the PDB. We can use a regular function
        match later to pull in the orig address."""

        if self._recomp_used(addr):
            return False

        thunk_name = f"Thunk of '{name}'"

        # Assuming relative jump instruction for thunks (5 bytes)
        cur = self._db.execute(
            """INSERT INTO `symbols`
            (recomp_addr, compare_type, name, size)
            VALUES (?,?,?,?)""",
            (addr, SymbolType.FUNCTION.value, thunk_name, 5),
        )

        return cur.rowcount > 0

    def _set_opt_bool(self, addr: int, option: str, enabled: bool = True):
        if enabled:
            self._db.execute(
                """INSERT OR IGNORE INTO `match_options`
                (addr, name)
                VALUES (?, ?)""",
                (addr, option),
            )
        else:
            self._db.execute(
                """DELETE FROM `match_options` WHERE addr = ? AND name = ?""",
                (addr, option),
            )

    def mark_stub(self, orig: int):
        self._set_opt_bool(orig, "stub")

    def skip_compare(self, orig: int):
        self._set_opt_bool(orig, "skip")

    def get_match_options(self, addr: int) -> Optional[dict[str, Any]]:
        cur = self._db.execute(
            """SELECT name, value FROM `match_options` WHERE addr = ?""", (addr,)
        )

        return {
            option: value if value is not None else True
            for (option, value) in cur.fetchall()
        }

    def is_vtordisp(self, recomp_addr: int) -> bool:
        """Check whether this function is a vtordisp based on its
        decorated name. If its demangled name is missing the vtordisp
        indicator, correct that."""
        row = self._db.execute(
            """SELECT name, decorated_name
            FROM `symbols`
            WHERE recomp_addr = ?""",
            (recomp_addr,),
        ).fetchone()

        if row is None:
            return False

        (name, decorated_name) = row
        if "`vtordisp" in name:
            return True

        if decorated_name is None:
            # happens in debug builds, e.g. for "Thunk of 'LegoAnimActor::ClassName'"
            return False

        new_name = get_vtordisp_name(decorated_name)
        if new_name is None:
            return False

        self._db.execute(
            """UPDATE `symbols`
            SET name = ?
            WHERE recomp_addr = ?""",
            (new_name, recomp_addr),
        )

        return True

    def _find_potential_match(
        self, name: str, compare_type: SymbolType
    ) -> Optional[int]:
        """Name lookup"""
        match_decorate = compare_type != SymbolType.STRING and name.startswith("?")
        if match_decorate:
            sql = """
                SELECT recomp_addr
                FROM `symbols`
                WHERE orig_addr IS NULL
                AND decorated_name = ?
                AND (compare_type IS NULL OR compare_type = ?)
                LIMIT 1
            """
        else:
            sql = """
                SELECT recomp_addr
                FROM `symbols`
                WHERE orig_addr IS NULL
                AND name = ?
                AND (compare_type IS NULL OR compare_type = ?)
                LIMIT 1
            """

        row = self._db.execute(sql, (name, compare_type.value)).fetchone()
        return row[0] if row is not None else None

    def _find_static_variable(
        self, variable_name: str, function_sym: str
    ) -> Optional[int]:
        """Get the recomp address of a static function variable.
        Matches using a LIKE clause on the combination of:
        1. The variable name read from decomp marker.
        2. The decorated name of the enclosing function.
        For example, the variable "g_startupDelay" from function "IsleApp::Tick"
        has symbol: `?g_startupDelay@?1??Tick@IsleApp@@QAEXH@Z@4HA`
        The function's decorated name is: `?Tick@IsleApp@@QAEXH@Z`"""

        row = self._db.execute(
            """SELECT recomp_addr FROM `symbols`
            WHERE decorated_name LIKE '%' || ? || '%' || ? || '%'
            AND orig_addr IS NULL
            AND (compare_type = ? OR compare_type = ? OR compare_type IS NULL)""",
            (
                variable_name,
                function_sym,
                SymbolType.DATA.value,
                SymbolType.POINTER.value,
            ),
        ).fetchone()
        return row[0] if row is not None else None
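Sketch of the LIKE pattern assembled above ('%' || ? || '%' || ? || '%'), using the example symbol from the docstring; the same acceptance rule expressed with substring search:

variable_name = "g_startupDelay"
function_sym = "?Tick@IsleApp@@QAEXH@Z"
symbol = "?g_startupDelay@?1??Tick@IsleApp@@QAEXH@Z@4HA"
# The LIKE clause accepts symbols where the variable name appears
# somewhere before the enclosing function's decorated name.
i = symbol.find(variable_name)
j = symbol.find(function_sym, i + len(variable_name))
assert i != -1 and j != -1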
    def _match_on(self, compare_type: SymbolType, addr: int, name: str) -> bool:
        # Update the compare_type here too since the marker tells us what we should do

        # Truncate the name to 255 characters. It will not be possible to match a name
        # longer than that because MSVC truncates the debug symbols to this length.
        # See also: warning C4786.
        name = name[:255]

        logger.debug("Looking for %s %s", compare_type.name.lower(), name)
        recomp_addr = self._find_potential_match(name, compare_type)
        if recomp_addr is None:
            return False

        return self.set_pair(addr, recomp_addr, compare_type)

    def get_next_orig_addr(self, addr: int) -> Optional[int]:
        """Return the original address (matched or not) that follows
        the one given. If our recomp function size would cause us to read
        too many bytes for the original function, we can adjust it."""
        result = self._db.execute(
            """SELECT orig_addr
            FROM `symbols`
            WHERE orig_addr > ?
            ORDER BY orig_addr
            LIMIT 1""",
            (addr,),
        ).fetchone()

        return result[0] if result is not None else None

    def match_function(self, addr: int, name: str) -> bool:
        did_match = self._match_on(SymbolType.FUNCTION, addr, name)
        if not did_match:
            logger.error("Failed to find function symbol with name: %s", name)

        return did_match

    def match_vtable(
        self, addr: int, name: str, base_class: Optional[str] = None
    ) -> bool:
        # Set up our potential match names
        bare_vftable = f"{name}::`vftable'"
        for_name = base_class if base_class is not None else name
        for_vftable = f"{name}::`vftable'{{for `{for_name}'}}"

        # Only allow a match against "Class:`vftable'"
        # if this is the derived class.
        if base_class is None or base_class == name:
            name_options = (for_vftable, bare_vftable)
        else:
            name_options = (for_vftable, for_vftable)

        row = self._db.execute(
            """
            SELECT recomp_addr
            FROM `symbols`
            WHERE orig_addr IS NULL
            AND (name = ? OR name = ?)
            AND (compare_type = ?)
            LIMIT 1
            """,
            (*name_options, SymbolType.VTABLE.value),
        ).fetchone()

        if row is not None and self.set_pair(addr, row[0], SymbolType.VTABLE):
            return True

        logger.error("Failed to find vtable for class: %s", name)
        return False

    def match_static_variable(self, addr: int, name: str, function_addr: int) -> bool:
        """Matching a static function variable by combining the variable name
        with the decorated (mangled) name of its parent function."""

        cur = self._db.execute(
            """SELECT name, decorated_name
            FROM `symbols`
            WHERE orig_addr = ?""",
            (function_addr,),
        )

        if (result := cur.fetchone()) is None:
            logger.error("No function for static variable: %s", name)
            return False

        # Get the friendly name for the "failed to match" error message
        (function_name, decorated_name) = result

        recomp_addr = self._find_static_variable(name, decorated_name)
        if recomp_addr is not None:
            # TODO: This variable could be a pointer, but I don't think we
            # have a way to tell that right now.
            if self.set_pair(addr, recomp_addr, SymbolType.DATA):
                return True

        logger.error(
            "Failed to match static variable %s from function %s",
            name,
            function_name,
        )

        return False

    def match_variable(self, addr: int, name: str) -> bool:
        did_match = self._match_on(SymbolType.DATA, addr, name) or self._match_on(
            SymbolType.POINTER, addr, name
        )
        if not did_match:
            logger.error("Failed to find variable: %s", name)

        return did_match

    def match_string(self, addr: int, value: str) -> bool:
        did_match = self._match_on(SymbolType.STRING, addr, value)
        if not did_match:
            escaped = repr(value)
            logger.error("Failed to find string: %s", escaped)

        return did_match
@ -1,104 +0,0 @@
from difflib import SequenceMatcher
from typing import Dict, List, Tuple

CombinedDiffInput = List[Tuple[str, str]]
# from inner to outer:
#   Tuple[str, ...]: either (orig_addr, instruction, recomp_addr) or (addr, instruction)
#   List[...]: a contiguous block of instructions, all matching or all mismatching
#   Dict[...]: either {"both": List[...]} or {"orig": [...], "recomp": [...]}
#   Tuple[str, List[...]]: One contiguous part of the diff (without skipping matching code)
#   List[...]: The list of all the contiguous diffs of a given function
CombinedDiffOutput = List[Tuple[str, List[Dict[str, List[Tuple[str, ...]]]]]]


def combined_diff(
    diff: SequenceMatcher,
    orig_combined: CombinedDiffInput,
    recomp_combined: CombinedDiffInput,
    context_size: int = 3,
) -> CombinedDiffOutput:
    """We want to diff the original and recomp assembly. The "combined" assembly
    input has two components: the address of the instruction and the assembly text.
    We have already diffed the text only. This is the SequenceMatcher object.
    The SequenceMatcher can generate "opcodes" that describe how to turn "Text A"
    into "Text B". These refer to list indices of the original arrays, so we can
    use those to create the final diff and include the address for each line of assembly.
    This is almost the same procedure as the difflib.unified_diff function, but we
    are reusing the already generated SequenceMatcher object.
    """

    unified_diff = []

    for group in diff.get_grouped_opcodes(context_size):
        subgroups = []

        # Keep track of the addresses we've seen in this diff group.
        # This helps create the "@@" line (the unified diff "hunk header").
        # Do it this way because not every line in each list will have an
        # address. If our context begins or ends on a line that does not
        # have one, we will have an incomplete range string.
        orig_addrs = set()
        recomp_addrs = set()

        first, last = group[0], group[-1]
        orig_range = len(orig_combined[first[1] : last[2]])
        recomp_range = len(recomp_combined[first[3] : last[4]])

        for code, i1, i2, j1, j2 in group:
            if code == "equal":
                # The sections are equal, so the list slices are guaranteed
                # to have the same length. We only need the diffed value (asm text)
                # from one of the lists, but we need the addresses from both.
                # Use zip to put the two lists together and then take out what we want.
                both = [
                    (a, b, c)
                    for ((a, b), (c, _)) in zip(
                        orig_combined[i1:i2], recomp_combined[j1:j2]
                    )
                ]

                for orig_addr, _, recomp_addr in both:
                    if orig_addr is not None:
                        orig_addrs.add(orig_addr)

                    if recomp_addr is not None:
                        recomp_addrs.add(recomp_addr)

                subgroups.append({"both": both})
            else:
                for orig_addr, _ in orig_combined[i1:i2]:
                    if orig_addr is not None:
                        orig_addrs.add(orig_addr)

                for recomp_addr, _ in recomp_combined[j1:j2]:
                    if recomp_addr is not None:
                        recomp_addrs.add(recomp_addr)

                subgroups.append(
                    {
                        "orig": orig_combined[i1:i2],
                        "recomp": recomp_combined[j1:j2],
                    }
                )

        orig_sorted = sorted(orig_addrs)
        recomp_sorted = sorted(recomp_addrs)

        # We could get a diff group that has no original addresses.
        # This might happen for a stub function where we are not able to
        # produce even a single instruction from the original.
        # In that case, show the best slug line that we can.
        def peek_front(list_, default=""):
            try:
                return list_[0]
            except IndexError:
                return default

        orig_first = peek_front(orig_sorted)
        recomp_first = peek_front(recomp_sorted)

        diff_slug = f"@@ -{orig_first},{orig_range} +{recomp_first},{recomp_range} @@"

        unified_diff.append((diff_slug, subgroups))

    return unified_diff
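Sketch of get_grouped_opcodes, which drives the loop above: it yields opcode groups around each change with n lines of context.

import difflib

a = ["push ebp", "mov ebp, esp", "call _foo", "pop ebp", "ret"]
b = ["push ebp", "mov ebp, esp", "call _bar", "pop ebp", "ret"]
sm = difflib.SequenceMatcher(None, a, b)
for group in sm.get_grouped_opcodes(n=1):
    for code, i1, i2, j1, j2 in group:
        print(code, a[i1:i2], b[j1:j2])
# equal   ['mov ebp, esp'] ['mov ebp, esp']
# replace ['call _foo']    ['call _bar']
# equal   ['pop ebp']      ['pop ebp']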
@ -1,69 +0,0 @@
"""Database used to match (filename, line_number) pairs
between FUNCTION markers and PDB analysis."""
import sqlite3
import logging
from functools import cache
from typing import Optional
from pathlib import Path
from isledecomp.dir import PathResolver


_SETUP_SQL = """
    DROP TABLE IF EXISTS `lineref`;
    CREATE TABLE `lineref` (
        path text not null,
        filename text not null,
        line int not null,
        addr int not null
    );
    CREATE INDEX `file_line` ON `lineref` (filename, line);
"""


logger = logging.getLogger(__name__)


@cache
def my_samefile(path: str, source_path: str) -> bool:
    return Path(path).samefile(source_path)


@cache
def my_basename_lower(path: str) -> str:
    return Path(path).name.lower()


class LinesDb:
    def __init__(self, code_dir) -> None:
        self._db = sqlite3.connect(":memory:")
        self._db.executescript(_SETUP_SQL)
        self._path_resolver = PathResolver(code_dir)

    def add_line(self, path: str, line_no: int, addr: int):
        """To be added from the LINES section of cvdump."""
        sourcepath = self._path_resolver.resolve_cvdump(path)
        filename = my_basename_lower(sourcepath)

        self._db.execute(
            "INSERT INTO `lineref` (path, filename, line, addr) VALUES (?,?,?,?)",
            (sourcepath, filename, line_no, addr),
        )

    def search_line(self, path: str, line_no: int) -> Optional[int]:
        """Using path and line number from FUNCTION marker,
        get the address of this function in the recomp."""
        filename = my_basename_lower(path)
        cur = self._db.execute(
            "SELECT path, addr FROM `lineref` WHERE filename = ? AND line = ?",
            (filename, line_no),
        )
        for source_path, addr in cur.fetchall():
            if my_samefile(path, source_path):
                return addr

        logger.error(
            "Failed to find function symbol with filename and line: %s:%d",
            path,
            line_no,
        )
        return None
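Sketch of the two-stage lookup above: a cheap lowercased-basename match narrows the candidates, then samefile() (cached, since it hits the filesystem) confirms path identity. Paths here are illustrative.

from pathlib import Path

candidates = ["/src/LEGO1/define.cpp", "/src/ISLE/define.cpp"]
query = "/src/lego1/DEFINE.CPP"
narrowed = [
    c for c in candidates if Path(c).name.lower() == Path(query).name.lower()
]
# Path.samefile() would then pick the true match among `narrowed`.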
@ -1,5 +0,0 @@
from .symbols import SymbolsEntry
from .analysis import CvdumpAnalysis
from .parser import CvdumpParser
from .runner import Cvdump
from .types import CvdumpTypesParser
@ -1,187 +0,0 @@
"""For collating the results from parsing cvdump.exe into a more directly useful format."""

from typing import Dict, List, Tuple, Optional
from isledecomp.cvdump import SymbolsEntry
from isledecomp.types import SymbolType
from .parser import CvdumpParser
from .demangler import demangle_string_const, demangle_vtable
from .types import CvdumpKeyError, CvdumpIntegrityError, TypeInfo


class CvdumpNode:
    # pylint: disable=too-many-instance-attributes
    # These two are required and allow us to identify the symbol
    section: int
    offset: int
    # aka the mangled name from the PUBLICS section
    decorated_name: Optional[str] = None
    # optional "nicer" name (e.g. of a function from SYMBOLS section)
    friendly_name: Optional[str] = None
    # To be determined by context after inserting data, unless the decorated
    # name makes this obvious. (i.e. string constants or vtables)
    # We choose not to assume that section 1 (probably ".text") contains only
    # functions. Smacker functions are linked to their own section "_UNSTEXT"
    node_type: Optional[SymbolType] = None
    # Function size can be read from the LINES section so use this over any
    # other value if we have it.
    # TYPES section can tell us the size of structs and other complex types.
    confirmed_size: Optional[int] = None
    # Estimated by reading the distance between this symbol and the one that
    # follows in the same section.
    # If this is the last symbol in the section, we cannot estimate a size.
    estimated_size: Optional[int] = None
    # Size as reported by SECTION CONTRIBUTIONS section. Not guaranteed to be
    # accurate.
    section_contribution: Optional[int] = None
    addr: Optional[int] = None
    symbol_entry: Optional[SymbolsEntry] = None
    # Preliminary - only used for non-static variables at the moment
    data_type: Optional[TypeInfo] = None

    def __init__(self, section: int, offset: int) -> None:
        self.section = section
        self.offset = offset

    def set_decorated(self, name: str):
        self.decorated_name = name

        if self.decorated_name.startswith("??_7"):
            self.node_type = SymbolType.VTABLE
            self.friendly_name = demangle_vtable(self.decorated_name)

        elif self.decorated_name.startswith("??_8"):
            # This is the `vbtable' symbol for virtual inheritance.
            # Should be okay to reuse demangle_vtable. We still want to
            # remove things like "const" from the output.
            self.node_type = SymbolType.DATA
            self.friendly_name = demangle_vtable(self.decorated_name)

        elif self.decorated_name.startswith("??_C@"):
            self.node_type = SymbolType.STRING
            (strlen, _) = demangle_string_const(self.decorated_name)
            self.confirmed_size = strlen

        elif not self.decorated_name.startswith("?") and "@" in self.decorated_name:
            # C mangled symbol. The trailing at-sign with number tells the number of bytes
            # in the parameter list for __stdcall, __fastcall, or __vectorcall
            # For __cdecl it is more ambiguous and we would have to know which section we are in.
            # https://learn.microsoft.com/en-us/cpp/build/reference/decorated-names?view=msvc-170#FormatC
            self.node_type = SymbolType.FUNCTION

    def name(self) -> Optional[str]:
        """Prefer "friendly" name if we have it.
        This is what we have been using to match functions."""
        return (
            self.friendly_name
            if self.friendly_name is not None
            else self.decorated_name
        )

    def size(self) -> Optional[int]:
        if self.confirmed_size is not None:
            return self.confirmed_size

        # Better to undershoot the size because we can identify a comparison gap easily
        if self.estimated_size is not None and self.section_contribution is not None:
            return min(self.estimated_size, self.section_contribution)

        # Return whichever one we have, or neither
        return self.estimated_size or self.section_contribution


class CvdumpAnalysis:
    """Collects the results from CvdumpParser into a list of nodes (i.e. symbols).
    These can then be analyzed by a downstream tool."""

    verified_lines: Dict[Tuple[str, str], Tuple[str, str]]

    def __init__(self, parser: CvdumpParser):
        """Read in as much information as we have from the parser.
        The more sections we have, the better our information will be."""
        node_dict: Dict[Tuple[int, int], CvdumpNode] = {}

        # PUBLICS is our roadmap for everything that follows.
        for pub in parser.publics:
            key = (pub.section, pub.offset)
            if key not in node_dict:
                node_dict[key] = CvdumpNode(*key)

            node_dict[key].set_decorated(pub.name)

        for sizeref in parser.sizerefs:
            key = (sizeref.section, sizeref.offset)
            if key not in node_dict:
                node_dict[key] = CvdumpNode(*key)

            node_dict[key].section_contribution = sizeref.size

        for glo in parser.globals:
            key = (glo.section, glo.offset)
            if key not in node_dict:
                node_dict[key] = CvdumpNode(*key)

            node_dict[key].node_type = SymbolType.DATA
            node_dict[key].friendly_name = glo.name

            try:
                # Check our types database for type information.
                # If we did not parse the TYPES section, we can only
                # get information for built-in "T_" types.
                g_info = parser.types.get(glo.type)
                node_dict[key].confirmed_size = g_info.size
                node_dict[key].data_type = g_info
                # Previously we set the symbol type to POINTER here if
                # the variable was known to be a pointer. We can derive this
                # information later when it's time to compare the variable,
                # so let's set these to symbol type DATA instead.
                # POINTER will be reserved for non-variable pointer data.
                # e.g. thunks, unwind section.
            except (CvdumpKeyError, CvdumpIntegrityError):
                # No big deal if we don't have complete type information.
                pass

        for key, _ in parser.lines.items():
            # Here we only set if the section:offset already exists
            # because our values include offsets inside of the function.
            if key in node_dict:
                node_dict[key].node_type = SymbolType.FUNCTION

        # The LINES section contains every code line in the file, naturally.
        # There isn't an obvious separation between functions, so we have to
        # read everything. However, any function that would be in LINES
        # has to be somewhere else in the PDB (probably PUBLICS).
        # Isolate the lines that we actually care about for matching.
        self.verified_lines = {
            key: value for (key, value) in parser.lines.items() if key in node_dict
        }

        for sym in parser.symbols:
            key = (sym.section, sym.offset)
            if key not in node_dict:
                node_dict[key] = CvdumpNode(*key)

            if sym.type == "S_GPROC32":
                node_dict[key].friendly_name = sym.name
                node_dict[key].confirmed_size = sym.size
                node_dict[key].node_type = SymbolType.FUNCTION
                node_dict[key].symbol_entry = sym

        self.nodes: List[CvdumpNode] = [
            v for _, v in dict(sorted(node_dict.items())).items()
        ]
        self._estimate_size()

    def _estimate_size(self):
        """Get the distance between one section:offset value and the next one
        in the same section. This gives a rough estimate of the size of the symbol.
        If we have information from SECTION CONTRIBUTIONS, take whichever one is
        less to get the best approximate size."""
        for i in range(len(self.nodes) - 1):
            this_node = self.nodes[i]
            next_node = self.nodes[i + 1]

            # If they are in different sections, we can't compare them
            if this_node.section != next_node.section:
                continue

            this_node.estimated_size = next_node.offset - this_node.offset
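Worked example of the size estimation above: the distance to the next symbol in the same section bounds the size of the current symbol.

nodes = [(1, 0x100), (1, 0x140), (2, 0x000)]  # sorted (section, offset) keys
sizes = {}
for (sec_a, ofs_a), (sec_b, ofs_b) in zip(nodes, nodes[1:]):
    if sec_a == sec_b:
        sizes[(sec_a, ofs_a)] = ofs_b - ofs_a
assert sizes == {(1, 0x100): 0x40}  # the last symbol per section stays unknown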
@ -1,121 +0,0 @@
"""For demangling a subset of MSVC mangled symbols.
Some unofficial information about the mangling scheme is here:
https://en.wikiversity.org/wiki/Visual_C%2B%2B_name_mangling
"""
import re
from collections import namedtuple
from typing import Optional
import pydemangler


class InvalidEncodedNumberError(Exception):
    pass


_encoded_number_translate = str.maketrans("ABCDEFGHIJKLMNOP", "0123456789ABCDEF")


def parse_encoded_number(string: str) -> int:
    # TODO: assert string ends in "@"?
    if string.endswith("@"):
        string = string[:-1]

    try:
        return int(string.translate(_encoded_number_translate), 16)
    except ValueError as e:
        raise InvalidEncodedNumberError(string) from e
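Worked example of the encoded-number scheme above: MSVC writes hex digits 0-F as the letters A-P, terminated by "@".

mapping = str.maketrans("ABCDEFGHIJKLMNOP", "0123456789ABCDEF")
assert int("BA@".rstrip("@").translate(mapping), 16) == 0x10  # "BA" -> "10"
assert int("M@".rstrip("@").translate(mapping), 16) == 0xC    # "M"  -> "C"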
string_const_regex = re.compile(
    r"\?\?_C@\_(?P<is_utf16>[0-1])(?P<len>\d|[A-P]+@)(?P<hash>\w+)@(?P<value>.+)@"
)
StringConstInfo = namedtuple("StringConstInfo", "len is_utf16")


def demangle_string_const(symbol: str) -> Optional[StringConstInfo]:
    """Don't bother to decode the string text from the symbol.
    We can just read it from the binary once we have the length."""
    match = string_const_regex.match(symbol)
    if match is None:
        return None

    try:
        strlen = (
            parse_encoded_number(match.group("len"))
            if "@" in match.group("len")
            else int(match.group("len"))
        )
    except (ValueError, InvalidEncodedNumberError):
        return None

    is_utf16 = match.group("is_utf16") == "1"
    return StringConstInfo(len=strlen, is_utf16=is_utf16)


def get_vtordisp_name(symbol: str) -> Optional[str]:
    # pylint: disable=c-extension-no-member
    """For adjuster thunk functions, the PDB will sometimes use a name
    that contains "vtordisp" but often will just reuse the name of the
    function being thunked. We want to use the vtordisp name if possible."""
    name = pydemangler.demangle(symbol)
    if name is None:
        return None

    if "`vtordisp" not in name:
        return None

    # Now we remove the parts of the friendly name that we don't need
    try:
        # Assuming this is the last of the function prefixes
        thiscall_idx = name.index("__thiscall")
        # To match the end of the `vtordisp{x,y}' string
        end_idx = name.index("}'")
        return name[thiscall_idx + 11 : end_idx + 2]
    except ValueError:
        return name


def demangle_vtable(symbol: str) -> str:
    # pylint: disable=c-extension-no-member
    """Get the class name referenced in the vtable symbol."""
    raw = pydemangler.demangle(symbol)

    if raw is None:
        # TODO: This shouldn't happen if MSVC behaves.
        # Fall back to the mangled name rather than crash on the None below.
        return symbol

    # Remove storage class and other stuff we don't care about
    return (
        raw.replace("<class ", "<")
        .replace("<struct ", "<")
        .replace("const ", "")
        .replace("volatile ", "")
    )


def demangle_vtable_ourselves(symbol: str) -> str:
    """Parked implementation of MSVC symbol demangling.
    We only use this for vtables and it works okay with the simple cases or
    templates that refer to other classes/structs. Some namespace support.
    Does not support backrefs, primitive types, or vtables with
    virtual inheritance."""

    # Seek ahead 4 chars to strip off "??_7" prefix
    t = symbol[4:].split("@")
    # "?$" indicates a template class
    if t[0].startswith("?$"):
        class_name = t[0][2:]
        # PA = Pointer/reference
        # V or U = class or struct
        if t[1].startswith("PA"):
            generic = f"{t[1][3:]} *"
        else:
            generic = t[1][1:]

        return f"{class_name}<{generic}>::`vftable'"

    # If we have two classes listed, it is a namespace hierarchy.
    # @@6B@ is a common generic suffix for these vtable symbols.
    if t[1] != "" and t[1] != "6B":
        return t[1] + "::" + t[0] + "::`vftable'"

    return t[0] + "::`vftable'"
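Spot-checking the parked demangler above with two vtable symbols in the plain and namespaced forms (symbols are illustrative):

assert demangle_vtable_ourselves("??_7LegoEntity@@6B@") == "LegoEntity::`vftable'"
assert demangle_vtable_ourselves("??_7ViewROI@Tgl@@6B@") == "Tgl::ViewROI::`vftable'"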
@ -1,182 +0,0 @@
import re
from typing import Iterable, Tuple
from collections import namedtuple
from .types import CvdumpTypesParser
from .symbols import CvdumpSymbolsParser

# e.g. `*** PUBLICS`
_section_change_regex = re.compile(r"\*\*\* (?P<section>[A-Z/ ]{2,})")

# e.g. ` 27 00034EC0 28 00034EE2 29 00034EE7 30 00034EF4`
_line_addr_pairs_findall = re.compile(r"\s+(?P<line_no>\d+) (?P<addr>[A-F0-9]{8})")

# We assume no spaces in the file name
# e.g. ` Z:\lego-island\isle\LEGO1\viewmanager\viewroi.cpp (None), 0001:00034E90-00034E97, line/addr pairs = 2`
_lines_subsection_header = re.compile(
    r"^\s*(?P<filename>\S+).*?, (?P<section>[A-F0-9]{4}):(?P<start>[A-F0-9]{8})-(?P<end>[A-F0-9]{8}), line/addr pairs = (?P<len>\d+)"
)

# e.g. `S_PUB32: [0001:0003FF60], Flags: 00000000, __read`
_publics_line_regex = re.compile(
    r"^(?P<type>\w+): \[(?P<section>\w{4}):(?P<offset>\w{8})], Flags: (?P<flags>\w{8}), (?P<name>\S+)"
)

# e.g. ` Debug start: 00000008, Debug end: 0000016E`
_gproc_debug_regex = re.compile(
    r"\s*Debug start: (?P<start>\w{8}), Debug end: (?P<end>\w{8})"
)

# e.g. ` 00DA 0001:00000000 00000073 60501020`
_section_contrib_regex = re.compile(
    r"\s*(?P<module>\w{4}) (?P<section>\w{4}):(?P<offset>\w{8}) (?P<size>\w{8}) (?P<flags>\w{8})"
)

# e.g. `S_GDATA32: [0003:000004A4], Type: T_32PRCHAR(0470), g_set`
_gdata32_regex = re.compile(
    r"S_GDATA32: \[(?P<section>\w{4}):(?P<offset>\w{8})\], Type:\s*(?P<type>\S+), (?P<name>.+)"
)

# e.g. 0003 "CMakeFiles/isle.dir/ISLE/res/isle.rc.res"
# e.g. 0004 "C:\work\lego-island\isle\3rdparty\smartheap\SHLW32MT.LIB" "check.obj"
_module_regex = re.compile(r"(?P<id>\w{4})(?: \"(?P<lib>.+?)\")?(?: \"(?P<obj>.+?)\")")
# User functions only
|
||||
LinesEntry = namedtuple("LinesEntry", "filename line_no section offset")
|
||||
|
||||
# Strings, vtables, functions
|
||||
# superset of everything else
|
||||
# only place you can find the C symbols (library functions, smacker, etc)
|
||||
PublicsEntry = namedtuple("PublicsEntry", "type section offset flags name")
|
||||
|
||||
# (Estimated) size of any symbol
|
||||
SizeRefEntry = namedtuple("SizeRefEntry", "module section offset size")
|
||||
|
||||
# global variables
|
||||
GdataEntry = namedtuple("GdataEntry", "section offset type name")
|
||||
|
||||
ModuleEntry = namedtuple("ModuleEntry", "id lib obj")
|
||||
|
||||
|
||||
class CvdumpParser:
|
||||
# pylint: disable=too-many-instance-attributes
|
||||
def __init__(self) -> None:
|
||||
self._section: str = ""
|
||||
self._lines_function: Tuple[str, int] = ("", 0)
|
||||
|
||||
self.lines = {}
|
||||
self.publics = []
|
||||
self.sizerefs = []
|
||||
self.globals = []
|
||||
self.modules = []
|
||||
|
||||
self.types = CvdumpTypesParser()
|
||||
self.symbols_parser = CvdumpSymbolsParser()
|
||||
|
||||
@property
|
||||
def symbols(self):
|
||||
return self.symbols_parser.symbols
|
||||
|
||||
def _lines_section(self, line: str):
|
||||
"""Parsing entries from the LINES section. We only care about the pairs of
|
||||
line_number and address and the subsection header to indicate which code file
|
||||
we are in."""
|
||||
|
||||
# Subheader indicates a new function and possibly a new code filename.
|
||||
# Save the section here because it is not given on the lines that follow.
|
||||
if (match := _lines_subsection_header.match(line)) is not None:
|
||||
self._lines_function = (
|
||||
match.group("filename"),
|
||||
int(match.group("section"), 16),
|
||||
)
|
||||
return
|
||||
|
||||
# Match any pairs as we find them
|
||||
for line_no, offset in _line_addr_pairs_findall.findall(line):
|
||||
key = (self._lines_function[1], int(offset, 16))
|
||||
self.lines[key] = (self._lines_function[0], int(line_no))
|
||||
|
||||
def _publics_section(self, line: str):
|
||||
"""Match each line from PUBLICS and pull out the symbol information.
|
||||
These are MSVC mangled symbol names. String constants and vtable
|
||||
addresses can only be found here."""
|
||||
if (match := _publics_line_regex.match(line)) is not None:
|
||||
self.publics.append(
|
||||
PublicsEntry(
|
||||
type=match.group("type"),
|
||||
section=int(match.group("section"), 16),
|
||||
offset=int(match.group("offset"), 16),
|
||||
flags=int(match.group("flags"), 16),
|
||||
name=match.group("name"),
|
||||
)
|
||||
)
|
||||
|
||||
def _globals_section(self, line: str):
|
||||
"""S_PROCREF may be useful later.
|
||||
Right now we just want S_GDATA32 symbols because it is the simplest
|
||||
way to access global variables."""
|
||||
if (match := _gdata32_regex.match(line)) is not None:
|
||||
self.globals.append(
|
||||
GdataEntry(
|
||||
section=int(match.group("section"), 16),
|
||||
offset=int(match.group("offset"), 16),
|
||||
type=match.group("type"),
|
||||
name=match.group("name"),
|
||||
)
|
||||
)
|
||||
|
||||
def _section_contributions(self, line: str):
|
||||
"""Gives the size of elements across all sections of the binary.
|
||||
This is the easiest way to get the data size for .data and .rdata
|
||||
members that do not have a primitive data type."""
|
||||
if (match := _section_contrib_regex.match(line)) is not None:
|
||||
self.sizerefs.append(
|
||||
SizeRefEntry(
|
||||
module=int(match.group("module"), 16),
|
||||
section=int(match.group("section"), 16),
|
||||
offset=int(match.group("offset"), 16),
|
||||
size=int(match.group("size"), 16),
|
||||
)
|
||||
)
|
||||
|
||||
def _modules_section(self, line: str):
|
||||
"""Record the object file (and lib file, if used) linked into the binary.
|
||||
The auto-incrementing id is cross-referenced in SECTION CONTRIBUTIONS
|
||||
(and perhaps other locations)"""
|
||||
if (match := _module_regex.match(line)) is not None:
|
||||
self.modules.append(
|
||||
ModuleEntry(
|
||||
id=int(match.group("id"), 16),
|
||||
lib=match.group("lib"),
|
||||
obj=match.group("obj"),
|
||||
)
|
||||
)
|
||||
|
||||
def read_line(self, line: str):
|
||||
if (match := _section_change_regex.match(line)) is not None:
|
||||
self._section = match.group(1)
|
||||
return
|
||||
|
||||
if self._section == "TYPES":
|
||||
self.types.read_line(line)
|
||||
|
||||
elif self._section == "SYMBOLS":
|
||||
self.symbols_parser.read_line(line)
|
||||
|
||||
elif self._section == "LINES":
|
||||
self._lines_section(line)
|
||||
|
||||
elif self._section == "PUBLICS":
|
||||
self._publics_section(line)
|
||||
|
||||
elif self._section == "SECTION CONTRIBUTIONS":
|
||||
self._section_contributions(line)
|
||||
|
||||
elif self._section == "GLOBALS":
|
||||
self._globals_section(line)
|
||||
|
||||
elif self._section == "MODULES":
|
||||
self._modules_section(line)
|
||||
|
||||
def read_lines(self, lines: Iterable[str]):
|
||||
for line in lines:
|
||||
self.read_line(line)
|
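
# Illustration: feeding a couple of cvdump-style lines through the parser,
# e.g. from a unit test (the lines reuse the samples quoted in the comments
# above):
#
#     parser = CvdumpParser()
#     parser.read_lines([
#         "*** PUBLICS\n",
#         "S_PUB32: [0001:0003FF60], Flags: 00000000, __read\n",
#     ])
#     assert parser.publics[0].name == "__read"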
@ -1,83 +0,0 @@
import io
from os import name as os_name
from enum import Enum
from typing import List
import subprocess
from isledecomp.lib import lib_path_join
from isledecomp.dir import winepath_unix_to_win
from .parser import CvdumpParser


class DumpOpt(Enum):
    LINES = 0
    SYMBOLS = 1
    GLOBALS = 2
    PUBLICS = 3
    SECTION_CONTRIB = 4
    MODULES = 5
    TYPES = 6


cvdump_opt_map = {
    DumpOpt.LINES: "-l",
    DumpOpt.SYMBOLS: "-s",
    DumpOpt.GLOBALS: "-g",
    DumpOpt.PUBLICS: "-p",
    DumpOpt.SECTION_CONTRIB: "-seccontrib",
    DumpOpt.MODULES: "-m",
    DumpOpt.TYPES: "-t",
}


class Cvdump:
    def __init__(self, pdb: str) -> None:
        self._pdb: str = pdb
        self._options = set()

    def lines(self):
        self._options.add(DumpOpt.LINES)
        return self

    def symbols(self):
        self._options.add(DumpOpt.SYMBOLS)
        return self

    def globals(self):
        self._options.add(DumpOpt.GLOBALS)
        return self

    def publics(self):
        self._options.add(DumpOpt.PUBLICS)
        return self

    def section_contributions(self):
        self._options.add(DumpOpt.SECTION_CONTRIB)
        return self

    def modules(self):
        self._options.add(DumpOpt.MODULES)
        return self

    def types(self):
        self._options.add(DumpOpt.TYPES)
        return self

    def cmd_line(self) -> List[str]:
        cvdump_exe = lib_path_join("cvdump.exe")
        flags = [cvdump_opt_map[opt] for opt in self._options]

        if os_name == "nt":
            return [cvdump_exe, *flags, self._pdb]

        return ["wine", cvdump_exe, *flags, winepath_unix_to_win(self._pdb)]

    def run(self) -> CvdumpParser:
        parser = CvdumpParser()
        call = self.cmd_line()
        with subprocess.Popen(call, stdout=subprocess.PIPE) as proc:
            for line in io.TextIOWrapper(proc.stdout, encoding="utf-8"):
                # Blank lines are only there to help the (human) reader;
                # they have no semantic significance, so skip them.
                if line != "\n":
                    parser.read_line(line)

        return parser
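
# Illustration of the builder chain (the .PDB path is hypothetical):
#
#     parser = Cvdump("build/ISLE.PDB").lines().publics().types().run()
#     print(len(parser.publics), "public symbols")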
@ -1,162 +0,0 @@
from dataclasses import dataclass, field
import logging
import re
from re import Match
from typing import NamedTuple, Optional


logger = logging.getLogger(__name__)


class StackOrRegisterSymbol(NamedTuple):
    symbol_type: str
    location: str
    """Should always be set/converted to lowercase."""
    data_type: str
    name: str


# S_GPROC32 = functions
@dataclass
class SymbolsEntry:
    # pylint: disable=too-many-instance-attributes
    type: str
    section: int
    offset: int
    size: int
    func_type: str
    name: str
    stack_symbols: list[StackOrRegisterSymbol] = field(default_factory=list)
    frame_pointer_present: bool = False
    addr: Optional[int] = None  # Absolute address. Will be set later, if at all


class CvdumpSymbolsParser:
    """Parser for cvdump output, SYMBOLS section."""

    _symbol_line_generic_regex = re.compile(
        r"\(\w+\)\s+(?P<symbol_type>[^\s:]+)(?::\s+(?P<second_part>\S.*))?|(?::)$"
    )
    """
    Parses the first part, e.g. `(00008C) S_GPROC32`, and splits off the second part after the colon (if it exists).
    There are three cases:
    - no colon, e.g. `(000350) S_END`
    - colon but no data, e.g. `(000370) S_COMPILE:`
    - colon and data, e.g. `(000304) S_REGISTER: esi, Type: 0x1E14, this`
    """

    _symbol_line_function_regex = re.compile(
        r"\[(?P<section>\w{4}):(?P<offset>\w{8})\], Cb: (?P<size>\w+), Type:\s+(?P<func_type>[^\s,]+), (?P<name>.+)"
    )
    """
    Parses the second part of a function symbol, e.g.
    `[0001:00034E90], Cb: 00000007, Type: 0x1024, ViewROI::IntrinsicImportance`
    """

    _stack_register_symbol_regex = re.compile(
        r"(?P<location>\S+), Type:\s+(?P<data_type>[\w()]+), (?P<name>.+)$"
    )
    """
    Parses the second part of a stack or register symbol, e.g.
    `esi, Type: 0x1E14, this`
    """

    _debug_start_end_regex = re.compile(
        r"^\s*Debug start: (?P<debug_start>\w+), Debug end: (?P<debug_end>\w+)$"
    )

    _parent_end_next_regex = re.compile(
        r"\s*Parent: (?P<parent_addr>\w+), End: (?P<end_addr>\w+), Next: (?P<next_addr>\w+)$"
    )

    _flags_frame_pointer_regex = re.compile(r"\s*Flags: Frame Ptr Present$")

    _register_stack_symbols = ["S_BPREL32", "S_REGISTER"]

    # List the unhandled types so we can check exhaustiveness
    _unhandled_symbols = [
        "S_COMPILE",
        "S_OBJNAME",
        "S_THUNK32",
        "S_LABEL32",
        "S_LDATA32",
        "S_UDT",
    ]

    def __init__(self):
        self.symbols: list[SymbolsEntry] = []
        self.current_function: Optional[SymbolsEntry] = None
        # If we read an S_BLOCK32 node, increment this level.
        # This is so we do not end the proc early by reading an S_END
        # that indicates the end of the block.
        self.block_level: int = 0

    def read_line(self, line: str):
        if (match := self._symbol_line_generic_regex.match(line)) is not None:
            self._parse_generic_case(line, match)
        elif (match := self._parent_end_next_regex.match(line)) is not None:
            # We do not need this info at the moment, might be useful in the future
            pass
        elif (match := self._debug_start_end_regex.match(line)) is not None:
            # We do not need this info at the moment, might be useful in the future
            pass
        elif (match := self._flags_frame_pointer_regex.match(line)) is not None:
            if self.current_function is None:
                logger.error(
                    "Found a `Flags: Frame Ptr Present` but self.current_function is None"
                )
                return
            self.current_function.frame_pointer_present = True
        else:
            # Most of these are either `** Module: [...]` or data we do not care about
            logger.debug("Unhandled line: %s", line[:-1])

    def _parse_generic_case(self, line, line_match: Match[str]):
        symbol_type: str = line_match.group("symbol_type")
        second_part: Optional[str] = line_match.group("second_part")

        if symbol_type in ["S_GPROC32", "S_LPROC32"]:
            assert second_part is not None
            if (match := self._symbol_line_function_regex.match(second_part)) is None:
                logger.error("Invalid function symbol: %s", line[:-1])
                return
            self.current_function = SymbolsEntry(
                type=symbol_type,
                section=int(match.group("section"), 16),
                offset=int(match.group("offset"), 16),
                size=int(match.group("size"), 16),
                func_type=match.group("func_type"),
                name=match.group("name"),
            )
            self.symbols.append(self.current_function)

        elif symbol_type in self._register_stack_symbols:
            assert second_part is not None
            if self.current_function is None:
                logger.error("Found stack/register outside of function: %s", line[:-1])
                return
            if (match := self._stack_register_symbol_regex.match(second_part)) is None:
                logger.error("Invalid stack/register symbol: %s", line[:-1])
                return

            new_symbol = StackOrRegisterSymbol(
                symbol_type=symbol_type,
                location=match.group("location").lower(),
                data_type=match.group("data_type"),
                name=match.group("name"),
            )
            self.current_function.stack_symbols.append(new_symbol)

        elif symbol_type == "S_BLOCK32":
            self.block_level += 1
        elif symbol_type == "S_END":
            if self.block_level > 0:
                self.block_level -= 1
                assert self.block_level >= 0
            else:
                self.current_function = None
        elif symbol_type in self._unhandled_symbols:
            return
        else:
            logger.error("Unhandled symbol type: %s", line)
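
# Illustration using the sample lines from the docstrings above:
#
#     parser = CvdumpSymbolsParser()
#     parser.read_line(
#         "(00008C) S_GPROC32: [0001:00034E90], Cb: 00000007, Type: 0x1024, ViewROI::IntrinsicImportance\n"
#     )
#     parser.read_line("(000304) S_REGISTER: esi, Type: 0x1E14, this\n")
#     assert parser.symbols[0].name == "ViewROI::IntrinsicImportance"
#     assert parser.symbols[0].stack_symbols[0].location == "esi"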
@ -1,737 +0,0 @@
from dataclasses import dataclass
import re
import logging
from typing import Any, Dict, List, NamedTuple, Optional


logger = logging.getLogger(__name__)


class CvdumpTypeError(Exception):
    pass


class CvdumpKeyError(KeyError):
    pass


class CvdumpIntegrityError(Exception):
    pass


class FieldListItem(NamedTuple):
    """Member of a class or structure"""

    offset: int
    name: str
    type: str


@dataclass
class VirtualBaseClass:
    type: str
    index: int
    direct: bool


@dataclass
class VirtualBasePointer:
    vboffset: int
    bases: list[VirtualBaseClass]


class ScalarType(NamedTuple):
    offset: int
    name: Optional[str]
    type: str

    @property
    def size(self) -> int:
        return scalar_type_size(self.type)

    @property
    def format_char(self) -> str:
        return scalar_type_format_char(self.type)

    @property
    def is_pointer(self) -> bool:
        return scalar_type_pointer(self.type)


class TypeInfo(NamedTuple):
    key: str
    size: Optional[int]
    name: Optional[str] = None
    members: Optional[List[FieldListItem]] = None

    def is_scalar(self) -> bool:
        # TODO: distinction between a class with zero members and no vtable?
        return self.members is None


def normalize_type_id(key: str) -> str:
    """Helper for TYPES parsing to ensure a consistent format.
    If key begins with "T_" it is a built-in type.
    Else it is a hex string. We prefer lower case letters and
    no leading zeroes. (UDT identifier pads to 8 characters.)"""
    if key[0] == "0":
        return f"0x{key[-4:].lower()}"

    # Remove numeric value for "T_" type. We don't use this.
    return key.partition("(")[0]
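

# Illustration:
#
#     >>> normalize_type_id("0x10FA")
#     '0x10fa'
#     >>> normalize_type_id("T_32PRCHAR(0470)")
#     'T_32PRCHAR'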


def scalar_type_pointer(type_name: str) -> bool:
    return type_name.startswith("T_32P")


def scalar_type_size(type_name: str) -> int:
    if scalar_type_pointer(type_name):
        return 4

    if "CHAR" in type_name:
        return 2 if "WCHAR" in type_name else 1

    if "SHORT" in type_name:
        return 2

    if "QUAD" in type_name or "64" in type_name:
        return 8

    return 4


def scalar_type_signed(type_name: str) -> bool:
    if scalar_type_pointer(type_name):
        return False

    # According to cvinfo.h, T_WCHAR is unsigned
    return not type_name.startswith("T_U") and not type_name.startswith("T_W")


def scalar_type_format_char(type_name: str) -> str:
    if scalar_type_pointer(type_name):
        return "L"

    # "Really a char"
    if type_name.startswith("T_RCHAR"):
        return "c"

    # floats
    if type_name.startswith("T_REAL"):
        return "d" if "64" in type_name else "f"

    size = scalar_type_size(type_name)
    char = ({1: "b", 2: "h", 4: "l", 8: "q"}).get(size, "l")

    return char if scalar_type_signed(type_name) else char.upper()
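

# Illustration of the mapping to struct-module format characters:
#
#     >>> scalar_type_format_char("T_SHORT")
#     'h'
#     >>> scalar_type_format_char("T_ULONG")
#     'L'
#     >>> scalar_type_format_char("T_REAL64")
#     'd'
#     >>> scalar_type_format_char("T_32PVOID")
#     'L'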


def member_list_to_struct_string(members: List[ScalarType]) -> str:
    """Create a string for use with struct.unpack"""

    format_string = "".join(m.format_char for m in members)
    if len(format_string) > 0:
        return "<" + format_string

    return ""


def join_member_names(parent: str, child: Optional[str]) -> str:
    """Helper method to combine parent/child member names.
    Child member name is None if the child is a scalar type."""

    if child is None:
        return parent

    # If the child is an array index, join without the dot
    if child.startswith("["):
        return f"{parent}{child}"

    return f"{parent}.{child}"


class CvdumpTypesParser:
    """Parser for cvdump output, TYPES section.
    Tricky enough that it demands its own parser."""

    # Marks the start of a new type
    INDEX_RE = re.compile(r"(?P<key>0x\w+) : .* (?P<type>LF_\w+)")

    # LF_FIELDLIST class/struct member (1/2)
    LIST_RE = re.compile(
        r"\s+list\[\d+\] = LF_MEMBER, (?P<scope>\w+), type = (?P<type>.*), offset = (?P<offset>\d+)"
    )

    # LF_FIELDLIST vtable indicator
    VTABLE_RE = re.compile(r"^\s+list\[\d+\] = LF_VFUNCTAB")

    # LF_FIELDLIST superclass indicator
    SUPERCLASS_RE = re.compile(
        r"^\s+list\[\d+\] = LF_BCLASS, (?P<scope>\w+), type = (?P<type>.*), offset = (?P<offset>\d+)"
    )

    # LF_FIELDLIST virtual direct/indirect base pointer, line 1/2
    VBCLASS_RE = re.compile(
        r"^\s+list\[\d+\] = LF_(?P<indirect>I?)VBCLASS, .* base type = (?P<type>.*)$"
    )

    # LF_FIELDLIST virtual direct/indirect base pointer, line 2/2
    VBCLASS_LINE_2_RE = re.compile(
        r"^\s+virtual base ptr = .+, vbpoff = (?P<vboffset>\d+), vbind = (?P<vbindex>\d+)$"
    )

    # LF_FIELDLIST member name (2/2)
    MEMBER_RE = re.compile(r"^\s+member name = '(?P<name>.*)'$")

    LF_FIELDLIST_ENUMERATE = re.compile(
        r"^\s+list\[\d+\] = LF_ENUMERATE,.*value = (?P<value>\d+), name = '(?P<name>[^']+)'$"
    )

    # LF_ARRAY element type
    ARRAY_ELEMENT_RE = re.compile(r"^\s+Element type = (?P<type>.*)")

    # LF_ARRAY total array size
    ARRAY_LENGTH_RE = re.compile(r"^\s+length = (?P<length>\d+)")

    # LF_CLASS/LF_STRUCTURE field list reference
    CLASS_FIELD_RE = re.compile(
        r"^\s+# members = \d+, field list type (?P<field_type>0x\w+),"
    )

    # LF_CLASS/LF_STRUCTURE name and other info
    CLASS_NAME_RE = re.compile(
        r"^\s+Size = (?P<size>\d+), class name = (?P<name>(?:[^,]|,\S)+)(?:, UDT\((?P<udt>0x\w+)\))?"
    )

    # LF_MODIFIER, type being modified
    MODIFIES_RE = re.compile(r".*modifies type (?P<type>.*)$")

    # LF_ARGLIST number of entries
    LF_ARGLIST_ARGCOUNT = re.compile(r".*argument count = (?P<argcount>\d+)$")

    # LF_ARGLIST list entry
    LF_ARGLIST_ENTRY = re.compile(
        r"^\s+list\[(?P<index>\d+)\] = (?P<arg_type>[\w()]+)$"
    )

    # LF_POINTER element
    LF_POINTER_ELEMENT = re.compile(r"^\s+Element type : (?P<element_type>.+)$")

    # LF_MFUNCTION attribute key-value pairs
    LF_MFUNCTION_ATTRIBUTES = [
        re.compile(r"\s*Return type = (?P<return_type>[\w()]+)$"),
        re.compile(r"\s*Class type = (?P<class_type>[\w()]+)$"),
        re.compile(r"\s*This type = (?P<this_type>[\w()]+)$"),
        # Call type may contain whitespace
        re.compile(r"\s*Call type = (?P<call_type>[\w()\s]+)$"),
        re.compile(r"\s*Parms = (?P<num_params>[\w()]+)$"),  # LF_MFUNCTION only
        re.compile(r"\s*# Parms = (?P<num_params>[\w()]+)$"),  # LF_PROCEDURE only
        re.compile(r"\s*Arg list type = (?P<arg_list_type>[\w()]+)$"),
        re.compile(
            r"\s*This adjust = (?P<this_adjust>[\w()]+)$"
        ),  # By how much the incoming pointers are shifted in virtual inheritance; hex value without `0x` prefix
        re.compile(
            r"\s*Func attr = (?P<func_attr>[\w()]+)$"
        ),  # Only for completeness, is always `none`
    ]

    LF_ENUM_ATTRIBUTES = [
        re.compile(r"^\s*# members = (?P<num_members>\d+)$"),
        re.compile(r"^\s*enum name = (?P<name>.+)$"),
    ]
    LF_ENUM_TYPES = re.compile(
        r"^\s*type = (?P<underlying_type>\S+) field list type (?P<field_type>0x\w{4})$"
    )
    LF_ENUM_UDT = re.compile(r"^\s*UDT\((?P<udt>0x\w+)\)$")
    LF_UNION_LINE = re.compile(
        r"^.*field list type (?P<field_type>0x\w+),.*Size = (?P<size>\d+)\s*,class name = (?P<name>(?:[^,]|,\S)+)(?:,\s.*UDT\((?P<udt>0x\w+)\))?$"
    )

    MODES_OF_INTEREST = {
        "LF_ARRAY",
        "LF_CLASS",
        "LF_ENUM",
        "LF_FIELDLIST",
        "LF_MODIFIER",
        "LF_POINTER",
        "LF_STRUCTURE",
        "LF_ARGLIST",
        "LF_MFUNCTION",
        "LF_PROCEDURE",
        "LF_UNION",
    }

    def __init__(self) -> None:
        self.mode: Optional[str] = None
        self.last_key = ""
        self.keys: Dict[str, Dict[str, Any]] = {}

    def _new_type(self):
        """Prepare a new dict for the type we just parsed.
        The id is self.last_key and the "type" of type is self.mode.
        e.g. LF_CLASS"""
        self.keys[self.last_key] = {"type": self.mode}

    def _set(self, key: str, value):
        self.keys[self.last_key][key] = value

    def _add_member(self, offset: int, type_: str):
        obj = self.keys[self.last_key]
        if "members" not in obj:
            obj["members"] = []

        obj["members"].append({"offset": offset, "type": type_})

    def _set_member_name(self, name: str):
        """Set name for most recently added member."""
        obj = self.keys[self.last_key]
        obj["members"][-1]["name"] = name

    def _add_variant(self, name: str, value: int):
        obj = self.keys[self.last_key]
        if "variants" not in obj:
            obj["variants"] = []
        variants: list[dict[str, Any]] = obj["variants"]
        variants.append({"name": name, "value": value})

    def _get_field_list(self, type_obj: Dict[str, Any]) -> List[FieldListItem]:
        """Return the field list for the given LF_CLASS/LF_STRUCTURE reference"""

        if type_obj.get("type") == "LF_FIELDLIST":
            field_obj = type_obj
        else:
            field_list_type = type_obj["field_list_type"]
            field_obj = self.keys[field_list_type]

        members: List[FieldListItem] = []

        super_ids = field_obj.get("super", [])
        for super_id in super_ids:
            # May need to resolve forward ref.
            superclass = self.get(super_id)
            if superclass.members is not None:
                members += superclass.members

        raw_members = field_obj.get("members", [])
        members += [
            FieldListItem(
                offset=m["offset"],
                type=m["type"],
                name=m["name"],
            )
            for m in raw_members
        ]

        return sorted(members, key=lambda m: m.offset)

    def _mock_array_members(self, type_obj: Dict) -> List[FieldListItem]:
        """LF_ARRAY elements provide the element type and the total size.
        We want the list of "members" as if this was a struct."""

        if type_obj.get("type") != "LF_ARRAY":
            raise CvdumpTypeError("Type is not an LF_ARRAY")

        array_type = type_obj.get("array_type")
        if array_type is None:
            raise CvdumpIntegrityError("No array element type")

        array_element_size = self.get(array_type).size
        assert (
            array_element_size is not None
        ), "Encountered an array whose type has no size"

        n_elements = type_obj["size"] // array_element_size

        return [
            FieldListItem(
                offset=i * array_element_size,
                type=array_type,
                name=f"[{i}]",
            )
            for i in range(n_elements)
        ]

    def get(self, type_key: str) -> TypeInfo:
        """Convert our dictionary values read from the cvdump output
        into a consistent format for the given type."""

        # Scalar type. Handled here because it makes the recursive steps
        # much simpler.
        if type_key.startswith("T_"):
            size = scalar_type_size(type_key)
            return TypeInfo(
                key=type_key,
                size=size,
            )

        # Go to our dictionary to find it.
        obj = self.keys.get(type_key.lower())
        if obj is None:
            raise CvdumpKeyError(type_key)

        # These type references are just a wrapper around a scalar
        if obj.get("type") == "LF_ENUM":
            underlying_type = obj.get("underlying_type")
            if underlying_type is None:
                raise CvdumpKeyError(f"Missing 'underlying_type' in {obj}")
            return self.get(underlying_type)

        if obj.get("type") == "LF_POINTER":
            return self.get("T_32PVOID")

        if obj.get("is_forward_ref", False):
            # Get the forward reference to follow.
            # If this is LF_CLASS/LF_STRUCTURE, it is the UDT value.
            # For LF_MODIFIER, it is the type being modified.
            forward_ref = obj.get("udt", None) or obj.get("modifies", None)
            if forward_ref is None:
                raise CvdumpIntegrityError(f"Null forward ref for type {type_key}")

            return self.get(forward_ref)

        # Else it is not a forward reference, so build out the object here.
        if obj.get("type") == "LF_ARRAY":
            members = self._mock_array_members(obj)
        else:
            members = self._get_field_list(obj)

        return TypeInfo(
            key=type_key,
            size=obj.get("size"),
            name=obj.get("name"),
            members=members,
        )

    def get_by_name(self, name: str) -> TypeInfo:
        """Find the complex type with the given name."""
        # TODO
        raise NotImplementedError

    def get_scalars(self, type_key: str) -> List[ScalarType]:
        """Reduce the given type to a list of scalars so we can
        compare each component value."""

        obj = self.get(type_key)
        if obj.is_scalar():
            # Use obj.key here for alias types like LF_POINTER
            return [ScalarType(offset=0, type=obj.key, name=None)]

        # Not a scalar, so members must be present (hint for the type checker)
        assert obj.members is not None

        # Dedupe repeated offsets if this is a union type
        unique_offsets = {m.offset: m for m in obj.members}
        unique_members = [m for _, m in unique_offsets.items()]

        return [
            ScalarType(
                offset=m.offset + cm.offset,
                type=cm.type,
                name=join_member_names(m.name, cm.name),
            )
            for m in unique_members
            for cm in self.get_scalars(m.type)
        ]

    def get_scalars_gapless(self, type_key: str) -> List[ScalarType]:
        """Reduce the given type to a list of scalars so we can
        compare each component value. Any gaps in the struct layout
        are filled with single-byte padding members."""

        obj = self.get(type_key)
        total_size = obj.size
        assert (
            total_size is not None
        ), "Called get_scalars_gapless() on a type without size"

        scalars = self.get_scalars(type_key)

        output = []
        last_extent = total_size

        # Walk the scalar list in reverse; we assume a gap could not
        # come at the start of the struct.
        for scalar in scalars[::-1]:
            this_extent = scalar.offset + scalar_type_size(scalar.type)
            size_diff = last_extent - this_extent
            # We need to add the gap fillers in reverse here
            for i in range(size_diff - 1, -1, -1):
                # Push to front
                output.insert(
                    0,
                    ScalarType(
                        offset=this_extent + i,
                        name="(padding)",
                        type="T_UCHAR",
                    ),
                )

            output.insert(0, scalar)
            last_extent = scalar.offset

        return output
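
    # Illustration: for a hypothetical 8-byte struct with a T_SHORT at
    # offset 0 and a T_ULONG at offset 4, get_scalars() yields the two
    # members, and get_scalars_gapless() fills the hole at bytes 2-3:
    #
    #     [ScalarType(offset=0, name="a", type="T_SHORT"),
    #      ScalarType(offset=2, name="(padding)", type="T_UCHAR"),
    #      ScalarType(offset=3, name="(padding)", type="T_UCHAR"),
    #      ScalarType(offset=4, name="b", type="T_ULONG")]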

    def get_format_string(self, type_key: str) -> str:
        members = self.get_scalars_gapless(type_key)
        return member_list_to_struct_string(members)

    def read_line(self, line: str):
        if line.endswith("\n"):
            line = line[:-1]
        if len(line) == 0:
            return

        if (match := self.INDEX_RE.match(line)) is not None:
            type_ = match.group(2)
            if type_ not in self.MODES_OF_INTEREST:
                self.mode = None
                return

            # Don't need to normalize, it's already in the format we want
            self.last_key = match.group(1)
            self.mode = type_
            self._new_type()

            if type_ == "LF_ARGLIST":
                submatch = self.LF_ARGLIST_ARGCOUNT.match(line)
                assert submatch is not None
                self.keys[self.last_key]["argcount"] = int(submatch.group("argcount"))
                # TODO: This should be validated in another pass
            return

        if self.mode is None:
            return

        if self.mode == "LF_MODIFIER":
            if (match := self.MODIFIES_RE.match(line)) is not None:
                # For convenience, because this is essentially the same thing
                # as an LF_CLASS forward ref.
                self._set("is_forward_ref", True)
                self._set("modifies", normalize_type_id(match.group("type")))

        elif self.mode == "LF_ARRAY":
            if (match := self.ARRAY_ELEMENT_RE.match(line)) is not None:
                self._set("array_type", normalize_type_id(match.group("type")))

            elif (match := self.ARRAY_LENGTH_RE.match(line)) is not None:
                self._set("size", int(match.group("length")))

        elif self.mode == "LF_FIELDLIST":
            self.read_fieldlist_line(line)

        elif self.mode == "LF_ARGLIST":
            self.read_arglist_line(line)

        elif self.mode in ["LF_MFUNCTION", "LF_PROCEDURE"]:
            self.read_mfunction_line(line)

        elif self.mode in ["LF_CLASS", "LF_STRUCTURE"]:
            self.read_class_or_struct_line(line)

        elif self.mode == "LF_POINTER":
            self.read_pointer_line(line)

        elif self.mode == "LF_ENUM":
            self.read_enum_line(line)

        elif self.mode == "LF_UNION":
            self.read_union_line(line)

        else:
            # Check for exhaustiveness
            logger.error("Unhandled data in mode: %s", self.mode)

    def read_fieldlist_line(self, line: str):
        # If this class has a vtable, create a mock member at offset 0
        if (match := self.VTABLE_RE.match(line)) is not None:
            # For our purposes, any pointer type will do
            self._add_member(0, "T_32PVOID")
            self._set_member_name("vftable")

        # Superclass is set here in the fieldlist rather than in LF_CLASS
        elif (match := self.SUPERCLASS_RE.match(line)) is not None:
            superclass_list: dict[str, int] = self.keys[self.last_key].setdefault(
                "super", {}
            )
            superclass_list[normalize_type_id(match.group("type"))] = int(
                match.group("offset")
            )

        # virtual base class (direct or indirect)
        elif (match := self.VBCLASS_RE.match(line)) is not None:
            virtual_base_pointer = self.keys[self.last_key].setdefault(
                "vbase",
                VirtualBasePointer(
                    vboffset=-1,  # default to -1 until we parse the correct value
                    bases=[],
                ),
            )
            assert isinstance(
                virtual_base_pointer, VirtualBasePointer
            )  # type checker only

            virtual_base_pointer.bases.append(
                VirtualBaseClass(
                    type=match.group("type"),
                    index=-1,  # default to -1 until we parse the correct value
                    direct=match.group("indirect") != "I",
                )
            )

        elif (match := self.VBCLASS_LINE_2_RE.match(line)) is not None:
            virtual_base_pointer = self.keys[self.last_key].get("vbase", None)
            assert isinstance(
                virtual_base_pointer, VirtualBasePointer
            ), "Parsed the second line of an (I)VBCLASS without the first one"
            vboffset = int(match.group("vboffset"))

            if virtual_base_pointer.vboffset == -1:
                # default value
                virtual_base_pointer.vboffset = vboffset
            elif virtual_base_pointer.vboffset != vboffset:
                # vboffset is always equal to 4 in our examples. We are not sure if there can be multiple
                # virtual base pointers, and if so, how the layout is supposed to look.
                # We therefore assume that there is always only one virtual base pointer.
                logger.error(
                    "Unhandled: Found multiple virtual base pointers at offsets %d and %d",
                    virtual_base_pointer.vboffset,
                    vboffset,
                )

            virtual_base_pointer.bases[-1].index = int(match.group("vbindex"))
            # these come out of order, and the lists are so short that it's fine to sort them every time
            virtual_base_pointer.bases.sort(key=lambda x: x.index)

        # Member offset and type given on the first of two lines.
        elif (match := self.LIST_RE.match(line)) is not None:
            self._add_member(
                int(match.group("offset")), normalize_type_id(match.group("type"))
            )

        # Name of the member read on the second of two lines.
        elif (match := self.MEMBER_RE.match(line)) is not None:
            self._set_member_name(match.group("name"))

        elif (match := self.LF_FIELDLIST_ENUMERATE.match(line)) is not None:
            self._add_variant(match.group("name"), int(match.group("value")))

    def read_class_or_struct_line(self, line: str):
        # Match the reference to the associated LF_FIELDLIST
        if (match := self.CLASS_FIELD_RE.match(line)) is not None:
            if match.group("field_type") == "0x0000":
                # Not redundant. UDT might not match the key.
                # These cases get reported as UDT mismatch.
                self._set("is_forward_ref", True)
            else:
                field_list_type = normalize_type_id(match.group("field_type"))
                self._set("field_list_type", field_list_type)

        elif line.lstrip().startswith("Derivation list type"):
            # We do not care about the second line, but we still match it so we see an error
            # when another line fails to match
            pass
        elif (match := self.CLASS_NAME_RE.match(line)) is not None:
            # Last line has the vital information.
            # If this is a FORWARD REF, we need to follow the UDT pointer
            # to get the actual class details.
            self._set("name", match.group("name"))
            udt = match.group("udt")
            if udt is not None:
                self._set("udt", normalize_type_id(udt))
            self._set("size", int(match.group("size")))
        else:
            logger.error("Unmatched line in class: %s", line[:-1])

    def read_arglist_line(self, line: str):
        if (match := self.LF_ARGLIST_ENTRY.match(line)) is not None:
            obj = self.keys[self.last_key]
            arglist: list = obj.setdefault("args", [])
            assert int(match.group("index")) == len(
                arglist
            ), "Argument list out of sync"
            arglist.append(match.group("arg_type"))
        else:
            logger.error("Unmatched line in arglist: %s", line[:-1])

    def read_pointer_line(self, line: str):
        if (match := self.LF_POINTER_ELEMENT.match(line)) is not None:
            self._set("element_type", match.group("element_type"))
        else:
            stripped_line = line.strip()
            # We don't parse these lines, but we still want to check for exhaustiveness
            # in case we missed some relevant data
            if not any(
                stripped_line.startswith(prefix)
                for prefix in ["Pointer", "const Pointer", "L-value", "volatile"]
            ):
                logger.error("Unrecognized pointer attribute: %s", line[:-1])

    def read_mfunction_line(self, line: str):
        """
        The layout is not consistent, so we want to be as robust as possible here.
        - Example 1:
          Return type = T_LONG(0012), Call type = C Near
          Func attr = none
        - Example 2:
          Return type = T_CHAR(0010), Class type = 0x101A, This type = 0x101B,
          Call type = ThisCall, Func attr = none
        """

        obj = self.keys[self.last_key]

        key_value_pairs = line.split(",")
        for pair in key_value_pairs:
            if pair.isspace():
                continue
            obj |= self.parse_function_attribute(pair)

    def parse_function_attribute(self, pair: str) -> dict[str, str]:
        for attribute_regex in self.LF_MFUNCTION_ATTRIBUTES:
            if (match := attribute_regex.match(pair)) is not None:
                return match.groupdict()
        logger.error("Unknown attribute in function: %s", pair)
        return {}

    def read_enum_line(self, line: str):
        obj = self.keys[self.last_key]

        # We need special comma handling because commas may appear in the name.
        # Splitting by "," yields the wrong result.
        enum_attributes = line.split(", ")
        for pair in enum_attributes:
            if pair.endswith(","):
                pair = pair[:-1]
            if pair.isspace():
                continue
            obj |= self.parse_enum_attribute(pair)

    def parse_enum_attribute(self, attribute: str) -> dict[str, Any]:
        for attribute_regex in self.LF_ENUM_ATTRIBUTES:
            if (match := attribute_regex.match(attribute)) is not None:
                return match.groupdict()
        if attribute == "NESTED":
            return {"is_nested": True}
        if attribute == "FORWARD REF":
            return {"is_forward_ref": True}
        if attribute.startswith("UDT"):
            match = self.LF_ENUM_UDT.match(attribute)
            assert match is not None
            return {"udt": normalize_type_id(match.group("udt"))}
        if (match := self.LF_ENUM_TYPES.match(attribute)) is not None:
            result = match.groupdict()
            result["underlying_type"] = normalize_type_id(result["underlying_type"])
            return result
        logger.error("Unknown attribute in enum: %s", attribute)
        return {}

    def read_union_line(self, line: str):
        """This is a rather barebones handler, only parsing the size"""
        if (match := self.LF_UNION_LINE.match(line)) is None:
            raise AssertionError(f"Unhandled in union: {line}")
        self._set("name", match.group("name"))
        if match.group("field_type") == "0x0000":
            self._set("is_forward_ref", True)

        self._set("size", int(match.group("size")))
        if match.group("udt") is not None:
            self._set("udt", normalize_type_id(match.group("udt")))
@ -1,103 +0,0 @@
import os
import subprocess
import sys
import pathlib
from typing import Iterator


def winepath_win_to_unix(path: str) -> str:
    return subprocess.check_output(["winepath", path], text=True).strip()


def winepath_unix_to_win(path: str) -> str:
    return subprocess.check_output(["winepath", "-w", path], text=True).strip()


class PathResolver:
    """Intended to resolve Windows/Wine paths used in the PDB (cvdump) output
    into a "canonical" format to be matched against code file paths from os.walk.
    MSVC may include files from the parent dir using `..`. We eliminate those and create
    an absolute path so that information about the same file under different names
    will be combined into the same record. (i.e. line_no/addr pairs from LINES section.)
    """

    def __init__(self, basedir) -> None:
        """basedir is the root path of the code directory in the format for your OS.
        We will convert it to a PureWindowsPath to be platform-independent
        and match that to the paths from the PDB."""

        # Memoize the converted paths. We will need to do this for each path
        # in the PDB, for each function in that file. (i.e. lots of repeated work)
        self._memo = {}

        # Convert basedir to an absolute path if it is not already.
        # If it is not absolute, we cannot do the path swap on unix.
        self._realdir = pathlib.Path(basedir).resolve()

        self._is_unix = os.name != "nt"
        if self._is_unix:
            self._basedir = pathlib.PureWindowsPath(
                winepath_unix_to_win(str(self._realdir))
            )
        else:
            self._basedir = self._realdir

    def _memo_wrapper(self, path_str: str) -> str:
        """Wrapper so we can memoize from the public caller method"""
        path = pathlib.PureWindowsPath(path_str)
        if not path.is_absolute():
            # pathlib syntactic sugar for path concat
            path = self._basedir / path

        if self._is_unix:
            # If the given path is relative to the basedir, deconstruct the path
            # and swap in our unix path to avoid an expensive call to winepath.
            try:
                # Will raise ValueError if we are not relative to the base.
                section = path.relative_to(self._basedir)
                # Should combine to pathlib.PosixPath
                mockpath = (self._realdir / section).resolve()
                if mockpath.is_file():
                    return str(mockpath)
            except ValueError:
                pass

            # We are not relative to the basedir, or our path swap attempt
            # did not point at an actual file. Either way, we are forced
            # to call winepath using our original path.
            return winepath_win_to_unix(str(path))

        # We must be on Windows. Convert back to WindowsPath.
        # The resolve() call will eliminate intermediate backdir references.
        return str(pathlib.Path(path).resolve())

    def resolve_cvdump(self, path_str: str) -> str:
        """path_str is in Windows/Wine path format.
        We will return a path in the format for the host OS."""
        if path_str not in self._memo:
            self._memo[path_str] = self._memo_wrapper(path_str)

        return self._memo[path_str]
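
    # Illustration (paths are hypothetical; on Linux the Z: drive is Wine's
    # default mapping of the unix filesystem root):
    #
    #     resolver = PathResolver("/home/user/isle")
    #     resolver.resolve_cvdump("Z:\\home\\user\\isle\\LEGO1\\legoomni.cpp")
    #     # -> "/home/user/isle/LEGO1/legoomni.cpp"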


def is_file_cpp(filename: str) -> bool:
    (_, ext) = os.path.splitext(filename)
    return ext.lower() in (".h", ".cpp")


def walk_source_dir(source: str, recursive: bool = True) -> Iterator[str]:
    """Generator to walk the given directory recursively and return
    any C++ files found."""

    source = os.path.abspath(source)
    for subdir, _, files in os.walk(source):
        for file in files:
            if is_file_cpp(file):
                yield os.path.join(subdir, file)

        if not recursive:
            break


def get_file_in_script_dir(fn):
    return os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), fn)
@ -1,13 +0,0 @@
"""Provides a reference point for redistributed tools found in this directory.
This allows you to get the path for these tools from a script run anywhere."""
from os.path import join, dirname


def lib_path() -> str:
    """Returns the directory for this module."""
    return dirname(__file__)


def lib_path_join(name: str) -> str:
    """Convenience wrapper for os.path.join."""
    return join(lib_path(), name)
@ -1,3 +0,0 @@
from .codebase import DecompCodebase
from .parser import DecompParser
from .linter import DecompLinter
@ -1,57 +0,0 @@
"""For aggregating decomp markers read from an entire directory and for a single module."""
from typing import Callable, Iterable, Iterator, List
from .parser import DecompParser
from .node import (
    ParserSymbol,
    ParserFunction,
    ParserVtable,
    ParserVariable,
    ParserString,
)


class DecompCodebase:
    def __init__(self, filenames: Iterable[str], module: str) -> None:
        self._symbols: List[ParserSymbol] = []

        parser = DecompParser()
        for filename in filenames:
            parser.reset()
            with open(filename, "r", encoding="utf-8") as f:
                parser.read_lines(f)

            for sym in parser.iter_symbols(module):
                sym.filename = filename
                self._symbols.append(sym)

    def prune_invalid_addrs(
        self, is_valid: Callable[[int], bool]
    ) -> List[ParserSymbol]:
        """Some decomp annotations might have an invalid address.
        Return the list of addresses where we fail the is_valid check,
        and remove those from our list of symbols."""
        invalid_symbols = [sym for sym in self._symbols if not is_valid(sym.offset)]
        self._symbols = [sym for sym in self._symbols if is_valid(sym.offset)]

        return invalid_symbols
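
    # Illustration: drop any annotation whose address falls outside a
    # hypothetical module's virtual address range.
    #
    #     bad = codebase.prune_invalid_addrs(
    #         lambda addr: 0x10000000 <= addr < 0x10200000
    #     )
    #     for sym in bad:
    #         print("invalid address:", hex(sym.offset), sym.filename)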

    def iter_line_functions(self) -> Iterator[ParserFunction]:
        """Return lineref functions separately from nameref. Assuming the PDB matches
        the state of the source code, a line reference is a guaranteed match, even if
        multiple functions share the same name. (i.e. polymorphism)"""
        return filter(
            lambda s: isinstance(s, ParserFunction) and not s.is_nameref(),
            self._symbols,
        )

    def iter_name_functions(self) -> Iterator[ParserFunction]:
        return filter(
            lambda s: isinstance(s, ParserFunction) and s.is_nameref(), self._symbols
        )

    def iter_vtables(self) -> Iterator[ParserVtable]:
        return filter(lambda s: isinstance(s, ParserVtable), self._symbols)

    def iter_variables(self) -> Iterator[ParserVariable]:
        return filter(lambda s: isinstance(s, ParserVariable), self._symbols)

    def iter_strings(self) -> Iterator[ParserString]:
        return filter(lambda s: isinstance(s, ParserString), self._symbols)
@ -1,97 +0,0 @@
from enum import Enum
from typing import Optional
from dataclasses import dataclass


# TODO: poorly chosen name, should be AlertType or AlertCode or something
class ParserError(Enum):
    # WARN: Stub function exceeds some line number threshold
    UNLIKELY_STUB = 100

    # WARN: Decomp marker is close enough to be recognized, but does not follow syntax exactly
    BAD_DECOMP_MARKER = 101

    # WARN: Multiple markers in sequence do not have distinct modules
    DUPLICATE_MODULE = 102

    # WARN: Detected a duplicate module/offset pair in the current file
    DUPLICATE_OFFSET = 103

    # WARN: We read a line that matches the decomp marker pattern, but we are not set up
    # to handle it
    BOGUS_MARKER = 104

    # WARN: New function marker appeared while we were inside a function
    MISSED_END_OF_FUNCTION = 105

    # WARN: If we find a curly brace right after the function declaration,
    # this is wrong, but we still have enough to make a match with reccmp
    MISSED_START_OF_FUNCTION = 106

    # WARN: A blank line appeared between the end of FUNCTION markers
    # and the start of the function. We can ignore it, but the line shouldn't be there
    UNEXPECTED_BLANK_LINE = 107

    # WARN: We called the finish() method for the parser but had not reached the starting
    # state of SEARCH
    UNEXPECTED_END_OF_FILE = 108

    # WARN: We found a marker to be referenced by name outside of a header file.
    BYNAME_FUNCTION_IN_CPP = 109

    # WARN: A GLOBAL marker appeared over a variable without the g_ prefix
    GLOBAL_MISSING_PREFIX = 110

    # WARN: GLOBAL marker points at something other than a variable declaration.
    # We can't match global variables based on position, but the goal here is
    # to ignore things like string literals that are not variables.
    GLOBAL_NOT_VARIABLE = 111

    # WARN: A marked static variable inside a function needs to have its
    # function marked too, and in the same module.
    ORPHANED_STATIC_VARIABLE = 112

    # This code or higher is an error, not a warning
    DECOMP_ERROR_START = 200

    # ERROR: We found a marker unexpectedly
    UNEXPECTED_MARKER = 200

    # ERROR: We found a marker where we expected to find one, but it is incompatible
    # with the preceding markers.
    # For example, a GLOBAL cannot follow FUNCTION/STUB
    INCOMPATIBLE_MARKER = 201

    # ERROR: The line following an explicit by-name marker was not a comment
    # We assume a syntax error here rather than try to use the next line
    BAD_NAMEREF = 202

    # ERROR: This function offset comes before the previous offset from the same module
    # This hopefully gives some hint about which functions need to be rearranged.
    FUNCTION_OUT_OF_ORDER = 203

    # ERROR: The line following an explicit by-name marker that does _not_ expect
    # a comment -- i.e. VTABLE or GLOBAL -- could not extract the name
    NO_SUITABLE_NAME = 204

    # ERROR: Two STRING markers have the same module and offset, but the strings
    # they annotate are different.
    WRONG_STRING = 205

    # ERROR: This lineref FUNCTION marker is next to a function declaration or
    # forward reference. The correct place for the marker is where the function
    # is implemented so we can match with the PDB.
    NO_IMPLEMENTATION = 206


@dataclass
class ParserAlert:
    code: ParserError
    line_number: int
    line: Optional[str] = None

    def is_warning(self) -> bool:
        return self.code.value < ParserError.DECOMP_ERROR_START.value

    def is_error(self) -> bool:
        return self.code.value >= ParserError.DECOMP_ERROR_START.value
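
# Illustration: codes below 200 are warnings, 200 and up are errors.
#
#     alert = ParserAlert(code=ParserError.DUPLICATE_OFFSET, line_number=42)
#     assert alert.is_warning() and not alert.is_error()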
@ -1,144 +0,0 @@
from typing import List, Optional
from .parser import DecompParser
from .error import ParserAlert, ParserError
from .node import ParserSymbol, ParserString


def get_checkorder_filter(module):
    """Return a filter function on implemented functions in the given module"""
    return lambda fun: fun.module == module and not fun.lookup_by_name


class DecompLinter:
    def __init__(self) -> None:
        self.alerts: List[ParserAlert] = []
        self._parser = DecompParser()
        self._filename: str = ""
        self._module: Optional[str] = None
        # Set of (str, int) tuples for each module/offset pair seen while scanning.
        # This is _not_ reset between files and is intended to report offset reuse
        # when scanning the entire directory.
        self._offsets_used = set()
        # Keep track of strings we have seen. Persists across files.
        # Module/offset can be repeated for string markers but the strings must match.
        self._strings = {}

    def reset(self, full_reset: bool = False):
        self.alerts = []
        self._parser.reset()
        self._filename = ""
        self._module = None

        if full_reset:
            self._offsets_used.clear()
            self._strings = {}

    def file_is_header(self):
        return self._filename.lower().endswith(".h")

    def _load_offsets_from_list(self, marker_list: List[ParserSymbol]):
        """Helper for loading (module, offset) tuples while the DecompParser
        has them broken up into separate lists."""
        for marker in marker_list:
            is_string = isinstance(marker, ParserString)

            value = (marker.module, marker.offset)
            if value in self._offsets_used:
                if is_string:
                    if self._strings[value] != marker.name:
                        self.alerts.append(
                            ParserAlert(
                                code=ParserError.WRONG_STRING,
                                line_number=marker.line_number,
                                line=f"0x{marker.offset:08x}, {repr(self._strings[value])} vs. {repr(marker.name)}",
                            )
                        )
                else:
                    self.alerts.append(
                        ParserAlert(
                            code=ParserError.DUPLICATE_OFFSET,
                            line_number=marker.line_number,
                            line=f"0x{marker.offset:08x}",
                        )
                    )
            else:
                self._offsets_used.add(value)
                if is_string:
                    self._strings[value] = marker.name

    def _check_function_order(self):
        """Rules:
        1. Only markers that are implemented in the file are considered. This means we
        only look at markers that are cross-referenced with cvdump output by their line
        number. Markers with the lookup_by_name flag set are ignored because we cannot
        directly influence their order.

        2. Order should be considered for a single module only. If we have multiple
        markers for a single function (i.e. for LEGO1 functions linked statically to
        ISLE) then the virtual address space will be very different. If we don't check
        for one module only, we would incorrectly report that the file is out of order.
        """

        if self._module is None:
            return

        checkorder_filter = get_checkorder_filter(self._module)
        last_offset = None
        for fun in filter(checkorder_filter, self._parser.functions):
            if last_offset is not None:
                if fun.offset < last_offset:
                    self.alerts.append(
                        ParserAlert(
                            code=ParserError.FUNCTION_OUT_OF_ORDER,
                            line_number=fun.line_number,
                        )
                    )

            last_offset = fun.offset

    def _check_offset_uniqueness(self):
        self._load_offsets_from_list(self._parser.functions)
        self._load_offsets_from_list(self._parser.vtables)
        self._load_offsets_from_list(self._parser.variables)
        self._load_offsets_from_list(self._parser.strings)

    def _check_byname_allowed(self):
        if self.file_is_header():
            return

        for fun in self._parser.functions:
            if fun.lookup_by_name:
                self.alerts.append(
                    ParserAlert(
                        code=ParserError.BYNAME_FUNCTION_IN_CPP,
                        line_number=fun.line_number,
                    )
                )

    def check_lines(self, lines, filename, module=None):
        """`lines` is a generic iterable to allow for testing with a list of strings.
        We assume lines has the entire contents of the compilation unit."""

        self.reset(False)
        self._filename = filename
        self._module = module

        self._parser.read_lines(lines)

        self._parser.finish()
        self.alerts = self._parser.alerts[::]

        self._check_offset_uniqueness()

        if self._module is not None:
            self._check_byname_allowed()

        if not self.file_is_header():
            self._check_function_order()

        return len(self.alerts) == 0
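
    # Illustration of the list-of-strings testing path mentioned above
    # (the marker line is a hypothetical decomp annotation):
    #
    #     linter = DecompLinter()
    #     ok = linter.check_lines(
    #         ["// FUNCTION: LEGO1 0x10010000\n", "void Foo() {}\n"],
    #         "foo.cpp",
    #         "LEGO1",
    #     )
    #     if not ok:
    #         print([alert.code for alert in linter.alerts])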
|
||||
|
||||
def check_file(self, filename, module=None):
|
||||
"""Convenience method for decomplint cli tool"""
|
||||
with open(filename, "r", encoding="utf-8") as f:
|
||||
return self.check_lines(f, filename, module)
|
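Because `lines` accepts any iterable, the linter can run against an in-memory file. A minimal sketch; the `DecompLinter` class name and its import path are assumptions for illustration, since only its methods appear in this hunk:

```python
# Hypothetical import path; adjust to the actual module.
from isledecomp.parser.linter import DecompLinter

linter = DecompLinter()
source = [
    "// FUNCTION: TEST 0x2000\n",
    "void second() {\n",
    "}\n",
    "// FUNCTION: TEST 0x1000\n",
    "void first() {\n",
    "}\n",
]
# Offsets decrease down the file, so _check_function_order should alert.
ok = linter.check_lines(source, "sample.cpp", "TEST")
assert ok is False
for alert in linter.alerts:
    print(alert.code, alert.line_number)
```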
@ -1,146 +0,0 @@
import re
from typing import Optional, Tuple
from enum import Enum


class MarkerCategory(Enum):
    """For the purposes of grouping multiple different DecompMarkers together,
    assign a rough "category" for the MarkerType values below.
    It's really only the function types that have to get folded down, but
    we'll do that in a structured way to permit future expansion."""

    FUNCTION = 1
    VARIABLE = 2
    STRING = 3
    VTABLE = 4
    ADDRESS = 100  # i.e. no comparison required or possible


class MarkerType(Enum):
    UNKNOWN = -100
    FUNCTION = 1
    STUB = 2
    SYNTHETIC = 3
    TEMPLATE = 4
    GLOBAL = 5
    VTABLE = 6
    STRING = 7
    LIBRARY = 8


markerRegex = re.compile(
    r"\s*//\s*(?P<type>\w+):\s*(?P<module>\w+)\s+(?P<offset>0x[a-f0-9]+) *(?P<extra>\S.+\S)?",
    flags=re.I,
)


markerExactRegex = re.compile(
    r"\s*// (?P<type>[A-Z]+): (?P<module>[A-Z0-9]+) (?P<offset>0x[a-f0-9]+)(?: (?P<extra>\S.+\S))?\n?$"
)


class DecompMarker:
    def __init__(
        self, marker_type: str, module: str, offset: int, extra: Optional[str] = None
    ) -> None:
        try:
            self._type = MarkerType[marker_type.upper()]
        except KeyError:
            self._type = MarkerType.UNKNOWN

        # Convert to upper here. A lot of other analysis depends on this name
        # being consistent and predictable. If the name is _not_ capitalized
        # we will emit a syntax error.
        self._module: str = module.upper()
        self._offset: int = offset
        self._extra: Optional[str] = extra

    @property
    def type(self) -> MarkerType:
        return self._type

    @property
    def module(self) -> str:
        return self._module

    @property
    def offset(self) -> int:
        return self._offset

    @property
    def extra(self) -> Optional[str]:
        return self._extra

    @property
    def category(self) -> MarkerCategory:
        if self.is_vtable():
            return MarkerCategory.VTABLE

        if self.is_variable():
            return MarkerCategory.VARIABLE

        if self.is_string():
            return MarkerCategory.STRING

        # TODO: worth another look if we add more types, but this covers it
        if self.is_regular_function() or self.is_explicit_byname():
            return MarkerCategory.FUNCTION

        return MarkerCategory.ADDRESS

    @property
    def key(self) -> Tuple[MarkerCategory, str, Optional[str]]:
        """For use with the MarkerDict. To detect/avoid marker collision."""
        return (self.category, self.module, self.extra)

    def is_regular_function(self) -> bool:
        """Regular function, meaning: not an explicit byname lookup. FUNCTION
        markers can be _implicit_ byname.
        FUNCTION and STUB markers are (currently) the only heterogeneous marker
        types that can be lumped together, although the reasons for doing so
        are a little vague."""
        return self._type in (MarkerType.FUNCTION, MarkerType.STUB)

    def is_explicit_byname(self) -> bool:
        return self._type in (
            MarkerType.SYNTHETIC,
            MarkerType.TEMPLATE,
            MarkerType.LIBRARY,
        )

    def is_variable(self) -> bool:
        return self._type == MarkerType.GLOBAL

    def is_synthetic(self) -> bool:
        return self._type == MarkerType.SYNTHETIC

    def is_template(self) -> bool:
        return self._type == MarkerType.TEMPLATE

    def is_vtable(self) -> bool:
        return self._type == MarkerType.VTABLE

    def is_library(self) -> bool:
        return self._type == MarkerType.LIBRARY

    def is_string(self) -> bool:
        return self._type == MarkerType.STRING

    def allowed_in_func(self) -> bool:
        return self._type in (MarkerType.GLOBAL, MarkerType.STRING)


def match_marker(line: str) -> Optional[DecompMarker]:
    match = markerRegex.match(line)
    if match is None:
        return None

    return DecompMarker(
        marker_type=match.group("type"),
        module=match.group("module"),
        offset=int(match.group("offset"), 16),
        extra=match.group("extra"),
    )


def is_marker_exact(line: str) -> bool:
    return markerExactRegex.match(line) is not None
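The two regexes split recognition from validation: `markerRegex` is case-insensitive and whitespace-tolerant so a sloppy marker can still be recognized (and warned about), while `markerExactRegex` accepts only the canonical format. A quick sketch:

```python
# A sloppy but recognizable marker: parsed, yet flagged as inexact.
line = "// function: test 0xdeadbeef"
marker = match_marker(line)
assert marker is not None
assert marker.type is MarkerType.FUNCTION
assert marker.module == "TEST"        # module is upper-cased on construction
assert marker.offset == 0xDEADBEEF
assert not is_marker_exact(line)      # lowercase fails the strict regex
assert is_marker_exact("// FUNCTION: TEST 0xdeadbeef")
```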
@ -1,63 +0,0 @@
from typing import Optional
from dataclasses import dataclass
from .marker import MarkerType


@dataclass
class ParserSymbol:
    """Exported decomp marker with all information (except the code filename)
    required to cross-reference with cvdump data."""

    type: MarkerType
    line_number: int
    module: str
    offset: int
    name: str

    # The parser doesn't (currently) know about the code filename, but if you
    # wanted to set it here after the fact, here's the spot.
    filename: Optional[str] = None

    def should_skip(self) -> bool:
        """The default is to compare any symbols we have"""
        return False

    def is_nameref(self) -> bool:
        """All symbols default to name lookup"""
        return True


@dataclass
class ParserFunction(ParserSymbol):
    # We are able to detect the closing line of a function with some reliability.
    # This isn't used for anything right now, but perhaps later it will be.
    end_line: Optional[int] = None

    # All marker types are referenced by name except FUNCTION/STUB. These can
    # also be referenced by name, but only if this flag is true.
    lookup_by_name: bool = False

    def should_skip(self) -> bool:
        return self.type == MarkerType.STUB

    def is_nameref(self) -> bool:
        return (
            self.type in (MarkerType.SYNTHETIC, MarkerType.TEMPLATE, MarkerType.LIBRARY)
            or self.lookup_by_name
        )


@dataclass
class ParserVariable(ParserSymbol):
    is_static: bool = False
    parent_function: Optional[int] = None


@dataclass
class ParserVtable(ParserSymbol):
    base_class: Optional[str] = None


@dataclass
class ParserString(ParserSymbol):
    pass
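The two overrides are what distinguish the subclasses; a short sketch (marker values invented for illustration):

```python
# STUB functions are parsed but skipped for comparison; SYNTHETIC,
# TEMPLATE, and LIBRARY functions are matched by name, not line number.
stub = ParserFunction(
    type=MarkerType.STUB, line_number=10, module="LEGO1",
    offset=0x10010000, name="Act3::Tickle",
)
assert stub.should_skip() is True
assert stub.is_nameref() is False

template = ParserFunction(
    type=MarkerType.TEMPLATE, line_number=20, module="LEGO1",
    offset=0x10020000, name="MxList<MxCore *>::~MxList<MxCore *>",
)
assert template.is_nameref() is True
```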
@ -1,556 +0,0 @@
|
||||
# C++ file parser
|
||||
|
||||
from typing import List, Iterable, Iterator, Optional
|
||||
from enum import Enum
|
||||
from .util import (
|
||||
get_class_name,
|
||||
get_variable_name,
|
||||
get_synthetic_name,
|
||||
remove_trailing_comment,
|
||||
get_string_contents,
|
||||
sanitize_code_line,
|
||||
scopeDetectRegex,
|
||||
)
|
||||
from .marker import (
|
||||
DecompMarker,
|
||||
MarkerCategory,
|
||||
match_marker,
|
||||
is_marker_exact,
|
||||
)
|
||||
from .node import (
|
||||
ParserSymbol,
|
||||
ParserFunction,
|
||||
ParserVariable,
|
||||
ParserVtable,
|
||||
ParserString,
|
||||
)
|
||||
from .error import ParserAlert, ParserError
|
||||
|
||||
|
||||
class ReaderState(Enum):
|
||||
SEARCH = 0
|
||||
WANT_SIG = 1
|
||||
IN_FUNC = 2
|
||||
IN_TEMPLATE = 3
|
||||
WANT_CURLY = 4
|
||||
IN_GLOBAL = 5
|
||||
IN_FUNC_GLOBAL = 6
|
||||
IN_VTABLE = 7
|
||||
IN_SYNTHETIC = 8
|
||||
IN_LIBRARY = 9
|
||||
DONE = 100
|
||||
|
||||
|
||||
class MarkerDict:
|
||||
def __init__(self) -> None:
|
||||
self.markers: dict = {}
|
||||
|
||||
def insert(self, marker: DecompMarker) -> bool:
|
||||
"""Return True if this insert would overwrite"""
|
||||
if marker.key in self.markers:
|
||||
return True
|
||||
|
||||
self.markers[marker.key] = marker
|
||||
return False
|
||||
|
||||
def query(
|
||||
self, category: MarkerCategory, module: str, extra: Optional[str] = None
|
||||
) -> Optional[DecompMarker]:
|
||||
return self.markers.get((category, module, extra))
|
||||
|
||||
    def iter(self) -> Iterator[DecompMarker]:
        yield from self.markers.values()
|
||||
|
||||
def empty(self):
|
||||
self.markers = {}
|
||||
|
||||
|
||||
class CurlyManager:
|
||||
"""Overly simplified scope manager"""
|
||||
|
||||
def __init__(self):
|
||||
self._stack = []
|
||||
|
||||
def reset(self):
|
||||
self._stack = []
|
||||
|
||||
def _pop(self):
|
||||
"""Pop stack safely"""
|
||||
try:
|
||||
self._stack.pop()
|
||||
except IndexError:
|
||||
pass
|
||||
|
||||
def get_prefix(self, name: Optional[str] = None) -> str:
|
||||
"""Return the prefix for where we are."""
|
||||
|
||||
scopes = [t for t in self._stack if t != "{"]
|
||||
if len(scopes) == 0:
|
||||
return name if name is not None else ""
|
||||
|
||||
if name is not None and name not in scopes:
|
||||
scopes.append(name)
|
||||
|
||||
return "::".join(scopes)
|
||||
|
||||
def read_line(self, raw_line: str):
|
||||
"""Read a line of code and update the stack."""
|
||||
line = sanitize_code_line(raw_line)
|
||||
if (match := scopeDetectRegex.match(line)) is not None:
|
||||
if not line.endswith(";"):
|
||||
self._stack.append(match.group("name"))
|
||||
|
||||
change = line.count("{") - line.count("}")
|
||||
if change > 0:
|
||||
for _ in range(change):
|
||||
self._stack.append("{")
|
||||
elif change < 0:
|
||||
for _ in range(-change):
|
||||
self._pop()
|
||||
|
||||
if len(self._stack) == 0:
|
||||
return
|
||||
|
||||
last = self._stack[-1]
|
||||
if last != "{":
|
||||
self._pop()
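CurlyManager only counts braces and scope keywords, but that is enough to reconstruct a `Class::member` prefix. A small sketch of it in action (mirroring the unit tests for this class further down):

```python
curly = CurlyManager()
curly.read_line("namespace Lego {")
curly.read_line("class Act1State {")
print(curly.get_prefix())              # -> "Lego::Act1State"
print(curly.get_prefix("g_instance"))  # -> "Lego::Act1State::g_instance"
curly.read_line("}")
print(curly.get_prefix())              # -> "Lego"
```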
|
||||
|
||||
|
||||
class DecompParser:
|
||||
# pylint: disable=too-many-instance-attributes
|
||||
# Could combine output lists into a single list to get under the limit,
|
||||
# but not right now
|
||||
def __init__(self) -> None:
|
||||
# The lists to be populated as we parse
|
||||
self._symbols: List[ParserSymbol] = []
|
||||
self.alerts: List[ParserAlert] = []
|
||||
|
||||
self.line_number: int = 0
|
||||
self.state: ReaderState = ReaderState.SEARCH
|
||||
|
||||
self.last_line: str = ""
|
||||
|
||||
self.curly = CurlyManager()
|
||||
|
||||
# To allow for multiple markers where code is shared across different
|
||||
# modules, save lists of compatible markers that appear in sequence
|
||||
self.fun_markers = MarkerDict()
|
||||
self.var_markers = MarkerDict()
|
||||
self.tbl_markers = MarkerDict()
|
||||
|
||||
# To handle functions that are entirely indented (i.e. those defined
|
||||
# in class declarations), remember how many whitespace characters
|
||||
# came before the opening curly brace and match that up at the end.
|
||||
# This should give us the same or better accuracy for a well-formed file.
|
||||
# The alternative is counting the curly braces on each line
|
||||
# but that's probably too cumbersome.
|
||||
self.curly_indent_stops: int = 0
|
||||
|
||||
# For non-synthetic functions, save the line number where the function begins
|
||||
# (i.e. where we see the curly brace) along with the function signature.
|
||||
# We will need both when we reach the end of the function.
|
||||
self.function_start: int = 0
|
||||
self.function_sig: str = ""
|
||||
|
||||
def reset(self):
|
||||
self._symbols = []
|
||||
self.alerts = []
|
||||
|
||||
self.line_number = 0
|
||||
self.state = ReaderState.SEARCH
|
||||
|
||||
self.last_line = ""
|
||||
|
||||
self.fun_markers.empty()
|
||||
self.var_markers.empty()
|
||||
self.tbl_markers.empty()
|
||||
|
||||
self.curly_indent_stops = 0
|
||||
self.function_start = 0
|
||||
self.function_sig = ""
|
||||
|
||||
self.curly.reset()
|
||||
|
||||
@property
|
||||
def functions(self) -> List[ParserFunction]:
|
||||
return [s for s in self._symbols if isinstance(s, ParserFunction)]
|
||||
|
||||
@property
|
||||
def vtables(self) -> List[ParserVtable]:
|
||||
return [s for s in self._symbols if isinstance(s, ParserVtable)]
|
||||
|
||||
@property
|
||||
def variables(self) -> List[ParserVariable]:
|
||||
return [s for s in self._symbols if isinstance(s, ParserVariable)]
|
||||
|
||||
@property
|
||||
def strings(self) -> List[ParserString]:
|
||||
return [s for s in self._symbols if isinstance(s, ParserString)]
|
||||
|
||||
def iter_symbols(self, module: Optional[str] = None) -> Iterator[ParserSymbol]:
|
||||
for s in self._symbols:
|
||||
if module is None or s.module == module:
|
||||
yield s
|
||||
|
||||
def _recover(self):
|
||||
"""We hit a syntax error and need to reset temp structures"""
|
||||
self.state = ReaderState.SEARCH
|
||||
self.fun_markers.empty()
|
||||
self.var_markers.empty()
|
||||
self.tbl_markers.empty()
|
||||
|
||||
def _syntax_warning(self, code):
|
||||
self.alerts.append(
|
||||
ParserAlert(
|
||||
line_number=self.line_number,
|
||||
code=code,
|
||||
line=self.last_line.strip(),
|
||||
)
|
||||
)
|
||||
|
||||
def _syntax_error(self, code):
|
||||
self._syntax_warning(code)
|
||||
self._recover()
|
||||
|
||||
def _function_starts_here(self):
|
||||
self.function_start = self.line_number
|
||||
|
||||
def _function_marker(self, marker: DecompMarker):
|
||||
if self.fun_markers.insert(marker):
|
||||
self._syntax_warning(ParserError.DUPLICATE_MODULE)
|
||||
self.state = ReaderState.WANT_SIG
|
||||
|
||||
def _nameref_marker(self, marker: DecompMarker):
|
||||
"""Functions explicitly referenced by name are set here"""
|
||||
if self.fun_markers.insert(marker):
|
||||
self._syntax_warning(ParserError.DUPLICATE_MODULE)
|
||||
|
||||
if marker.is_template():
|
||||
self.state = ReaderState.IN_TEMPLATE
|
||||
elif marker.is_synthetic():
|
||||
self.state = ReaderState.IN_SYNTHETIC
|
||||
else:
|
||||
self.state = ReaderState.IN_LIBRARY
|
||||
|
||||
def _function_done(self, lookup_by_name: bool = False, unexpected: bool = False):
|
||||
end_line = self.line_number
|
||||
if unexpected:
|
||||
# If we missed the end of the previous function, assume it ended
|
||||
# on the previous line and that whatever we are tracking next
|
||||
# begins on the current line.
|
||||
end_line -= 1
|
||||
|
||||
for marker in self.fun_markers.iter():
|
||||
self._symbols.append(
|
||||
ParserFunction(
|
||||
type=marker.type,
|
||||
line_number=self.function_start,
|
||||
module=marker.module,
|
||||
offset=marker.offset,
|
||||
name=self.function_sig,
|
||||
lookup_by_name=lookup_by_name,
|
||||
end_line=end_line,
|
||||
)
|
||||
)
|
||||
|
||||
self.fun_markers.empty()
|
||||
self.curly_indent_stops = 0
|
||||
self.state = ReaderState.SEARCH
|
||||
|
||||
def _vtable_marker(self, marker: DecompMarker):
|
||||
if self.tbl_markers.insert(marker):
|
||||
self._syntax_warning(ParserError.DUPLICATE_MODULE)
|
||||
self.state = ReaderState.IN_VTABLE
|
||||
|
||||
    def _vtable_done(self, class_name: Optional[str] = None):
|
||||
if class_name is None:
|
||||
# Best we can do
|
||||
class_name = self.last_line.strip()
|
||||
|
||||
for marker in self.tbl_markers.iter():
|
||||
self._symbols.append(
|
||||
ParserVtable(
|
||||
type=marker.type,
|
||||
line_number=self.line_number,
|
||||
module=marker.module,
|
||||
offset=marker.offset,
|
||||
name=self.curly.get_prefix(class_name),
|
||||
base_class=marker.extra,
|
||||
)
|
||||
)
|
||||
|
||||
self.tbl_markers.empty()
|
||||
self.state = ReaderState.SEARCH
|
||||
|
||||
def _variable_marker(self, marker: DecompMarker):
|
||||
if self.var_markers.insert(marker):
|
||||
self._syntax_warning(ParserError.DUPLICATE_MODULE)
|
||||
|
||||
if self.state in (ReaderState.IN_FUNC, ReaderState.IN_FUNC_GLOBAL):
|
||||
self.state = ReaderState.IN_FUNC_GLOBAL
|
||||
else:
|
||||
self.state = ReaderState.IN_GLOBAL
|
||||
|
||||
def _variable_done(
|
||||
self, variable_name: Optional[str] = None, string_value: Optional[str] = None
|
||||
):
|
||||
if variable_name is None and string_value is None:
|
||||
self._syntax_error(ParserError.NO_SUITABLE_NAME)
|
||||
return
|
||||
|
||||
for marker in self.var_markers.iter():
|
||||
if marker.is_string():
|
||||
self._symbols.append(
|
||||
ParserString(
|
||||
type=marker.type,
|
||||
line_number=self.line_number,
|
||||
module=marker.module,
|
||||
offset=marker.offset,
|
||||
name=string_value,
|
||||
)
|
||||
)
|
||||
else:
|
||||
parent_function = None
|
||||
is_static = self.state == ReaderState.IN_FUNC_GLOBAL
|
||||
|
||||
# If this is a static variable, we need to get the function
|
||||
# where it resides so that we can match it up later with the
|
||||
# mangled names of both variable and function from cvdump.
|
||||
if is_static:
|
||||
fun_marker = self.fun_markers.query(
|
||||
MarkerCategory.FUNCTION, marker.module
|
||||
)
|
||||
|
||||
if fun_marker is None:
|
||||
self._syntax_warning(ParserError.ORPHANED_STATIC_VARIABLE)
|
||||
continue
|
||||
|
||||
parent_function = fun_marker.offset
|
||||
|
||||
self._symbols.append(
|
||||
ParserVariable(
|
||||
type=marker.type,
|
||||
line_number=self.line_number,
|
||||
module=marker.module,
|
||||
offset=marker.offset,
|
||||
name=self.curly.get_prefix(variable_name),
|
||||
is_static=is_static,
|
||||
parent_function=parent_function,
|
||||
)
|
||||
)
|
||||
|
||||
self.var_markers.empty()
|
||||
if self.state == ReaderState.IN_FUNC_GLOBAL:
|
||||
self.state = ReaderState.IN_FUNC
|
||||
else:
|
||||
self.state = ReaderState.SEARCH
|
||||
|
||||
def _handle_marker(self, marker: DecompMarker):
|
||||
# Cannot handle any markers between function sig and opening curly brace
|
||||
if self.state == ReaderState.WANT_CURLY:
|
||||
self._syntax_error(ParserError.UNEXPECTED_MARKER)
|
||||
return
|
||||
|
||||
# If we are inside a function, the only markers we accept are:
|
||||
# GLOBAL, indicating a static variable
|
||||
# STRING, indicating a literal string.
|
||||
# Otherwise we assume that the parser missed the end of the function
|
||||
# and we have moved on to something else.
|
||||
# This is unlikely to occur with well-formed code, but
|
||||
# we can recover easily by just ending the function here.
|
||||
if self.state == ReaderState.IN_FUNC and not marker.allowed_in_func():
|
||||
self._syntax_warning(ParserError.MISSED_END_OF_FUNCTION)
|
||||
self._function_done(unexpected=True)
|
||||
|
||||
# TODO: How uncertain are we of detecting the end of a function
|
||||
# in a clang-formatted file? For now we assume we have missed the
|
||||
# end if we detect a non-GLOBAL marker while state is IN_FUNC.
|
||||
# Maybe these cases should be syntax errors instead
|
||||
|
||||
if marker.is_regular_function():
|
||||
if self.state in (
|
||||
ReaderState.SEARCH,
|
||||
ReaderState.WANT_SIG,
|
||||
):
|
||||
# We will allow multiple offsets if we have just begun
|
||||
# the code block, but not after we hit the curly brace.
|
||||
self._function_marker(marker)
|
||||
else:
|
||||
self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
|
||||
|
||||
elif marker.is_template():
|
||||
if self.state in (ReaderState.SEARCH, ReaderState.IN_TEMPLATE):
|
||||
self._nameref_marker(marker)
|
||||
else:
|
||||
self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
|
||||
|
||||
elif marker.is_synthetic():
|
||||
if self.state in (ReaderState.SEARCH, ReaderState.IN_SYNTHETIC):
|
||||
self._nameref_marker(marker)
|
||||
else:
|
||||
self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
|
||||
|
||||
elif marker.is_library():
|
||||
if self.state in (ReaderState.SEARCH, ReaderState.IN_LIBRARY):
|
||||
self._nameref_marker(marker)
|
||||
else:
|
||||
self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
|
||||
|
||||
# Strings and variables are almost the same thing
|
||||
elif marker.is_string() or marker.is_variable():
|
||||
if self.state in (
|
||||
ReaderState.SEARCH,
|
||||
ReaderState.IN_GLOBAL,
|
||||
ReaderState.IN_FUNC,
|
||||
ReaderState.IN_FUNC_GLOBAL,
|
||||
):
|
||||
self._variable_marker(marker)
|
||||
else:
|
||||
self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
|
||||
|
||||
elif marker.is_vtable():
|
||||
if self.state in (ReaderState.SEARCH, ReaderState.IN_VTABLE):
|
||||
self._vtable_marker(marker)
|
||||
else:
|
||||
self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
|
||||
|
||||
else:
|
||||
self._syntax_warning(ParserError.BOGUS_MARKER)
|
||||
|
||||
def read_line(self, line: str):
|
||||
if self.state == ReaderState.DONE:
|
||||
return
|
||||
|
||||
self.last_line = line # TODO: Useful or hack for error reporting?
|
||||
self.line_number += 1
|
||||
|
||||
marker = match_marker(line)
|
||||
if marker is not None:
|
||||
# TODO: what's the best place for this?
|
||||
# Does it belong with reading or marker handling?
|
||||
if not is_marker_exact(self.last_line):
|
||||
self._syntax_warning(ParserError.BAD_DECOMP_MARKER)
|
||||
self._handle_marker(marker)
|
||||
return
|
||||
|
||||
self.curly.read_line(line)
|
||||
|
||||
line_strip = line.strip()
|
||||
if self.state in (
|
||||
ReaderState.IN_SYNTHETIC,
|
||||
ReaderState.IN_TEMPLATE,
|
||||
ReaderState.IN_LIBRARY,
|
||||
):
|
||||
# Explicit nameref functions provide the function name
|
||||
# on the next line (in a // comment)
|
||||
name = get_synthetic_name(line)
|
||||
if name is None:
|
||||
self._syntax_error(ParserError.BAD_NAMEREF)
|
||||
else:
|
||||
self.function_sig = name
|
||||
self._function_starts_here()
|
||||
self._function_done(lookup_by_name=True)
|
||||
|
||||
elif self.state == ReaderState.WANT_SIG:
|
||||
# Ignore blanks on the way to function start or function name
|
||||
if len(line_strip) == 0:
|
||||
self._syntax_warning(ParserError.UNEXPECTED_BLANK_LINE)
|
||||
|
||||
elif line_strip.startswith("//"):
|
||||
# If we found a comment, assume implicit lookup-by-name
|
||||
# function and end here. We know this is not a decomp marker
|
||||
# because it would have been handled already.
|
||||
self.function_sig = get_synthetic_name(line)
|
||||
self._function_starts_here()
|
||||
self._function_done(lookup_by_name=True)
|
||||
|
||||
elif line_strip == "{":
|
||||
# We missed the function signature but we can recover from this
|
||||
self.function_sig = "(unknown)"
|
||||
self._function_starts_here()
|
||||
self._syntax_warning(ParserError.MISSED_START_OF_FUNCTION)
|
||||
self.state = ReaderState.IN_FUNC
|
||||
|
||||
else:
|
||||
# Inline functions may end with a comment. Strip that out
|
||||
# to help parsing.
|
||||
self.function_sig = remove_trailing_comment(line_strip)
|
||||
|
||||
# Now check to see if the opening curly bracket is on the
|
||||
# same line. clang-format should prevent this (BraceWrapping)
|
||||
# but it is easy to detect.
|
||||
# If the entire function is on one line, handle that too.
|
||||
if self.function_sig.endswith("{"):
|
||||
self._function_starts_here()
|
||||
self.state = ReaderState.IN_FUNC
|
||||
elif self.function_sig.endswith("}") or self.function_sig.endswith(
|
||||
"};"
|
||||
):
|
||||
self._function_starts_here()
|
||||
self._function_done()
|
||||
elif self.function_sig.endswith(");"):
|
||||
# Detect forward reference or declaration
|
||||
self._syntax_error(ParserError.NO_IMPLEMENTATION)
|
||||
else:
|
||||
self.state = ReaderState.WANT_CURLY
|
||||
|
||||
elif self.state == ReaderState.WANT_CURLY:
|
||||
if line_strip == "{":
|
||||
self.curly_indent_stops = line.index("{")
|
||||
self._function_starts_here()
|
||||
self.state = ReaderState.IN_FUNC
|
||||
|
||||
elif self.state == ReaderState.IN_FUNC:
|
||||
if line_strip.startswith("}") and line[self.curly_indent_stops] == "}":
|
||||
self._function_done()
|
||||
|
||||
elif self.state in (ReaderState.IN_GLOBAL, ReaderState.IN_FUNC_GLOBAL):
|
||||
# TODO: Known problem that an error here will cause us to abandon a
|
||||
# function we have already parsed if state == IN_FUNC_GLOBAL.
|
||||
# However, we are not tolerant of _any_ syntax problems in our
|
||||
# CI actions, so the solution is to just fix the invalid marker.
|
||||
variable_name = None
|
||||
|
||||
global_markers_queued = any(
|
||||
m.is_variable() for m in self.var_markers.iter()
|
||||
)
|
||||
|
||||
if len(line_strip) == 0:
|
||||
self._syntax_warning(ParserError.UNEXPECTED_BLANK_LINE)
|
||||
return
|
||||
|
||||
if global_markers_queued:
|
||||
# Not the greatest solution, but a consequence of combining GLOBAL and
|
||||
# STRING markers together. If the marker precedes a return statement, it is
|
||||
# valid for a STRING marker to be here, but not a GLOBAL. We need to look
|
||||
# ahead and tell whether this *would* fail.
|
||||
if line_strip.startswith("return"):
|
||||
self._syntax_error(ParserError.GLOBAL_NOT_VARIABLE)
|
||||
return
|
||||
                if line_strip.startswith("//"):
                    # If we found a comment, take the variable name from it.
                    # We know this is not a decomp marker because it would
                    # have been handled already.
                    variable_name = get_synthetic_name(line)
|
||||
else:
|
||||
variable_name = get_variable_name(line)
|
||||
|
||||
string_name = get_string_contents(line)
|
||||
|
||||
self._variable_done(variable_name, string_name)
|
||||
|
||||
elif self.state == ReaderState.IN_VTABLE:
|
||||
vtable_class = get_class_name(line)
|
||||
if vtable_class is not None:
|
||||
self._vtable_done(class_name=vtable_class)
|
||||
|
||||
def read_lines(self, lines: Iterable):
|
||||
for line in lines:
|
||||
self.read_line(line)
|
||||
|
||||
def finish(self):
|
||||
if self.state != ReaderState.SEARCH:
|
||||
self._syntax_warning(ParserError.UNEXPECTED_END_OF_FILE)
|
||||
|
||||
self.state = ReaderState.DONE
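End to end, the parser is driven line by line and the results come out of the typed symbol list properties. A minimal sketch:

```python
parser = DecompParser()
parser.read_lines([
    "// GLOBAL: TEST 0x1000",
    'const char* g_message = "hello";',
])
parser.finish()

var = parser.variables[0]
print(var.module, hex(var.offset), var.name)  # TEST 0x1000 g_message
```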
|
@ -1,141 +0,0 @@
# C++ Parser utility functions and data structures
import re
from typing import Optional
from ast import literal_eval

# The goal here is to just read whatever is on the next line, so some
# flexibility in the formatting seems OK
templateCommentRegex = re.compile(r"\s*//\s+(.*)")

# To remove any comment (//) or block comment (/*) and its leading spaces
# from the end of a code line
trailingCommentRegex = re.compile(r"(\s*(?://|/\*).*)$")

# Get char contents, ignore escape characters
singleQuoteRegex = re.compile(r"('(?:[^\'\\]|\\.)')")

# Match contents of block comment on one line
blockCommentRegex = re.compile(r"(/\*.*?\*/)")

# Match contents of single comment on one line
regularCommentRegex = re.compile(r"(//.*)")

# Get string contents, ignore escape characters that might interfere
doubleQuoteRegex = re.compile(r"(\"(?:[^\"\\]|\\.)*\")")

# Detect a line that would cause us to enter a new scope
scopeDetectRegex = re.compile(r"(?:class|struct|namespace) (?P<name>\w+).*(?:{)?")


def get_synthetic_name(line: str) -> Optional[str]:
    """Synthetic names appear in a single-line comment on the line after the
    marker. If that's not what we have, return None"""
    template_match = templateCommentRegex.match(line)

    if template_match is not None:
        return template_match.group(1)

    return None


def sanitize_code_line(line: str) -> str:
    """Helper for scope manager. Removes sections from a code line
    that would cause us to incorrectly detect curly brackets.
    This is a very naive implementation and fails entirely on multi-line
    strings or comments."""
    line = singleQuoteRegex.sub("''", line)
    line = doubleQuoteRegex.sub('""', line)
    line = blockCommentRegex.sub("", line)
    line = regularCommentRegex.sub("", line)

    return line.strip()


def remove_trailing_comment(line: str) -> str:
    return trailingCommentRegex.sub("", line)


def is_blank_or_comment(line: str) -> bool:
    """Helper to read ahead after the offset comment is matched.
    There could be blank lines or other comments before the
    function signature, and we want to skip those."""
    line_strip = line.strip()
    return (
        len(line_strip) == 0
        or line_strip.startswith("//")
        or line_strip.startswith("/*")
        or line_strip.endswith("*/")
    )


template_regex = re.compile(r"<(?P<type>[\w]+)\s*(?P<asterisks>\*+)?\s*>")


class_decl_regex = re.compile(
    r"\s*(?:\/\/)?\s*(?:class|struct) ((?:\w+(?:<.+>)?(?:::)?)+)"
)


def template_replace(match: re.Match) -> str:
    (type_name, asterisks) = match.groups()
    if asterisks is None:
        return f"<{type_name}>"

    return f"<{type_name} {asterisks}>"


def fix_template_type(class_name: str) -> str:
    """For template classes, we should reformat the class name so it matches
    the output from cvdump: one space between the template type and any
    asterisks if it is a pointer type."""
    if "<" not in class_name:
        return class_name

    return template_regex.sub(template_replace, class_name)
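For example, cvdump prints a space before the asterisk in template arguments, so:

```python
print(fix_template_type("MxList<LegoPathController*>"))
# -> "MxList<LegoPathController *>"
print(fix_template_type("MxRect32"))   # no template: returned unchanged
```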
def get_class_name(line: str) -> Optional[str]:
    """For VTABLE markers, extract the class name from the code line or comment
    where it appears."""
    match = class_decl_regex.match(line)
    if match is not None:
        return fix_template_type(match.group(1))

    return None


global_regex = re.compile(r"(?P<name>(?:\w+::)*g_\w+)")
less_strict_global_regex = re.compile(r"(?P<name>(?:\w+::)*\w+)(?:\)\(|\[.*|\s*=.*|;)")


def get_variable_name(line: str) -> Optional[str]:
    """Grab the name of the variable annotated with the GLOBAL marker.
    Correct syntax would have the variable start with the prefix "g_"
    but we will try to match regardless."""
    if (match := global_regex.search(line)) is not None:
        return match.group("name")

    if (match := less_strict_global_regex.search(line)) is not None:
        return match.group("name")

    return None


def get_string_contents(line: str) -> Optional[str]:
    """Return the first C string seen on this line.
    We have to unescape the string, and a simple way to do that is to use
    python's ast.literal_eval. I'm sure there are many pitfalls to doing
    it this way, but hopefully the regex will ensure reasonably sane input."""
    try:
        if (match := doubleQuoteRegex.search(line)) is not None:
            return literal_eval(match.group(1))
    # pylint: disable=broad-exception-caught
    # No way to predict what kind of exception could occur.
    except Exception:
        pass

    return None
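The `literal_eval` trick means escape sequences come back unescaped, and anything that is not a string literal safely returns None:

```python
print(get_string_contents('printf("one\\ntwo");'))  # -> 'one\ntwo', with a real newline
print(get_string_contents("int x = 5;"))            # -> None
```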
@ -1,13 +0,0 @@
|
||||
"""Types shared by other modules"""
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class SymbolType(Enum):
|
||||
"""Broadly tells us what kind of comparison is required for this symbol."""
|
||||
|
||||
FUNCTION = 1
|
||||
DATA = 2
|
||||
POINTER = 3
|
||||
STRING = 4
|
||||
VTABLE = 5
|
||||
FLOAT = 6
|
@ -1,308 +0,0 @@
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime
|
||||
import logging
|
||||
import colorama
|
||||
|
||||
|
||||
def print_combined_diff(udiff, plain: bool = False, show_both: bool = False):
|
||||
if udiff is None:
|
||||
return
|
||||
|
||||
# We don't know how long the address string will be ahead of time.
|
||||
# Set this value for each address to try to line things up.
|
||||
padding_size = 0
|
||||
|
||||
for slug, subgroups in udiff:
|
||||
if plain:
|
||||
print("---")
|
||||
print("+++")
|
||||
print(slug)
|
||||
else:
|
||||
print(f"{colorama.Fore.RED}---")
|
||||
print(f"{colorama.Fore.GREEN}+++")
|
||||
print(f"{colorama.Fore.BLUE}{slug}")
|
||||
print(colorama.Style.RESET_ALL, end="")
|
||||
|
||||
for subgroup in subgroups:
|
||||
equal = subgroup.get("both") is not None
|
||||
|
||||
if equal:
|
||||
for orig_addr, line, recomp_addr in subgroup["both"]:
|
||||
padding_size = max(padding_size, len(orig_addr))
|
||||
if show_both:
|
||||
print(f"{orig_addr} / {recomp_addr} : {line}")
|
||||
else:
|
||||
print(f"{orig_addr} : {line}")
|
||||
else:
|
||||
for orig_addr, line in subgroup["orig"]:
|
||||
padding_size = max(padding_size, len(orig_addr))
|
||||
addr_prefix = (
|
||||
f"{orig_addr} / {'':{padding_size}}" if show_both else orig_addr
|
||||
)
|
||||
|
||||
if plain:
|
||||
print(f"{addr_prefix} : -{line}")
|
||||
else:
|
||||
print(
|
||||
f"{addr_prefix} : {colorama.Fore.RED}-{line}{colorama.Style.RESET_ALL}"
|
||||
)
|
||||
|
||||
for recomp_addr, line in subgroup["recomp"]:
|
||||
padding_size = max(padding_size, len(recomp_addr))
|
||||
addr_prefix = (
|
||||
f"{'':{padding_size}} / {recomp_addr}"
|
||||
if show_both
|
||||
else " " * padding_size
|
||||
)
|
||||
|
||||
if plain:
|
||||
print(f"{addr_prefix} : +{line}")
|
||||
else:
|
||||
print(
|
||||
f"{addr_prefix} : {colorama.Fore.GREEN}+{line}{colorama.Style.RESET_ALL}"
|
||||
)
|
||||
|
||||
# Newline between each diff subgroup.
|
||||
print()
|
||||
|
||||
|
||||
def print_diff(udiff, plain):
|
||||
"""Print diff in difflib.unified_diff format."""
|
||||
if udiff is None:
|
||||
return False
|
||||
|
||||
has_diff = False
|
||||
for line in udiff:
|
||||
has_diff = True
|
||||
color = ""
|
||||
if line.startswith("++") or line.startswith("@@") or line.startswith("--"):
|
||||
# Skip unneeded parts of the diff for the brief view
|
||||
continue
|
||||
# Work out color if we are printing color
|
||||
if not plain:
|
||||
if line.startswith("+"):
|
||||
color = colorama.Fore.GREEN
|
||||
elif line.startswith("-"):
|
||||
color = colorama.Fore.RED
|
||||
print(color + line)
|
||||
# Reset color if we're printing in color
|
||||
if not plain:
|
||||
print(colorama.Style.RESET_ALL, end="")
|
||||
return has_diff
def get_percent_color(value: float) -> str:
    """Return the colorama ANSI escape sequence for the given decimal value."""
    if value == 1.0:
        return colorama.Fore.GREEN
    if value > 0.8:
        return colorama.Fore.YELLOW

    return colorama.Fore.RED


def percent_string(
    ratio: float, is_effective: bool = False, is_plain: bool = False
) -> str:
    """Helper to construct a percentage string from the given ratio.
    If is_effective (i.e. effective match), indicate that with an asterisk.
    If is_plain, don't use colorama ANSI codes."""

    percenttext = f"{(ratio * 100):.2f}%"
    effective_star = "*" if is_effective else ""

    if is_plain:
        return percenttext + effective_star

    return "".join(
        [
            get_percent_color(ratio),
            percenttext,
            colorama.Fore.RED if is_effective else "",
            effective_star,
            colorama.Style.RESET_ALL,
        ]
    )
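The plain variant is what ends up in the CI text reports, where ANSI codes would be noise:

```python
print(percent_string(0.9876, is_effective=True, is_plain=True))  # -> 98.76%*
print(percent_string(1.0, is_plain=True))                        # -> 100.00%
```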
|
||||
|
||||
|
||||
def diff_json_display(show_both_addrs: bool = False, is_plain: bool = False):
|
||||
"""Generate a function that will display the diff according to
|
||||
the reccmp display preferences."""
|
||||
|
||||
def formatter(orig_addr, saved, new) -> str:
|
||||
old_pct = "new"
|
||||
new_pct = "gone"
|
||||
name = ""
|
||||
recomp_addr = "n/a"
|
||||
|
||||
if new is not None:
|
||||
new_pct = (
|
||||
"stub"
|
||||
if new.get("stub", False)
|
||||
else percent_string(
|
||||
new["matching"], new.get("effective", False), is_plain
|
||||
)
|
||||
)
|
||||
|
||||
# Prefer the current name of this function if we have it.
|
||||
# We are using the original address as the key.
|
||||
# A function being renamed is not of interest here.
|
||||
name = new.get("name", "")
|
||||
recomp_addr = new.get("recomp", "n/a")
|
||||
|
||||
if saved is not None:
|
||||
old_pct = (
|
||||
"stub"
|
||||
if saved.get("stub", False)
|
||||
else percent_string(
|
||||
saved["matching"], saved.get("effective", False), is_plain
|
||||
)
|
||||
)
|
||||
|
||||
if name == "":
|
||||
name = saved.get("name", "")
|
||||
|
||||
if show_both_addrs:
|
||||
addr_string = f"{orig_addr} / {recomp_addr:10}"
|
||||
else:
|
||||
addr_string = orig_addr
|
||||
|
||||
# The ANSI codes from colorama counted towards string length,
|
||||
# so displaying this as an ascii-like spreadsheet
|
||||
# (using f-string formatting) would take some effort.
|
||||
return f"{addr_string} - {name} ({old_pct} -> {new_pct})"
|
||||
|
||||
return formatter
|
||||
|
||||
|
||||
def diff_json(
|
||||
saved_data,
|
||||
new_data,
|
||||
orig_file: str,
|
||||
show_both_addrs: bool = False,
|
||||
is_plain: bool = False,
|
||||
):
|
||||
"""Using a saved copy of the diff summary and the current data, print a
|
||||
report showing which functions/symbols have changed match percentage."""
|
||||
|
||||
# Don't try to diff a report generated for a different binary file
|
||||
base_file = os.path.basename(orig_file).lower()
|
||||
|
||||
if saved_data.get("file") != base_file:
|
||||
logging.getLogger().error(
|
||||
"Diff report for '%s' does not match current file '%s'",
|
||||
saved_data.get("file"),
|
||||
base_file,
|
||||
)
|
||||
return
|
||||
|
||||
if "timestamp" in saved_data:
|
||||
now = datetime.now().replace(microsecond=0)
|
||||
then = datetime.fromtimestamp(saved_data["timestamp"]).replace(microsecond=0)
|
||||
|
||||
print(
|
||||
" ".join(
|
||||
[
|
||||
"Saved diff report generated",
|
||||
then.strftime("%B %d %Y, %H:%M:%S"),
|
||||
f"({str(now - then)} ago)",
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
print()
|
||||
|
||||
# Convert to dict, using orig_addr as key
|
||||
saved_invert = {obj["address"]: obj for obj in saved_data["data"]}
|
||||
new_invert = {obj["address"]: obj for obj in new_data}
|
||||
|
||||
all_addrs = set(saved_invert.keys()).union(new_invert.keys())
|
||||
|
||||
# Put all the information in one place so we can decide how each item changed.
|
||||
combined = {
|
||||
addr: (
|
||||
saved_invert.get(addr),
|
||||
new_invert.get(addr),
|
||||
)
|
||||
for addr in sorted(all_addrs)
|
||||
}
|
||||
|
||||
# The criteria for diff judgement is in these dict comprehensions:
|
||||
# Any function not in the saved file
|
||||
new_functions = {
|
||||
key: (saved, new) for key, (saved, new) in combined.items() if saved is None
|
||||
}
|
||||
|
||||
# Any function now missing from the saved file
|
||||
# or a non-stub -> stub conversion
|
||||
dropped_functions = {
|
||||
key: (saved, new)
|
||||
for key, (saved, new) in combined.items()
|
||||
if new is None
|
||||
or (
|
||||
new is not None
|
||||
and saved is not None
|
||||
and new.get("stub", False)
|
||||
and not saved.get("stub", False)
|
||||
)
|
||||
}
|
||||
|
||||
# TODO: move these two into functions if the assessment gets more complex
|
||||
# Any function with increased match percentage
|
||||
# or stub -> non-stub conversion
|
||||
improved_functions = {
|
||||
key: (saved, new)
|
||||
for key, (saved, new) in combined.items()
|
||||
if saved is not None
|
||||
and new is not None
|
||||
and (
|
||||
new["matching"] > saved["matching"]
|
||||
or (not new.get("stub", False) and saved.get("stub", False))
|
||||
)
|
||||
}
|
||||
|
||||
# Any non-stub function with decreased match percentage
|
||||
degraded_functions = {
|
||||
key: (saved, new)
|
||||
for key, (saved, new) in combined.items()
|
||||
if saved is not None
|
||||
and new is not None
|
||||
and new["matching"] < saved["matching"]
|
||||
and not saved.get("stub")
|
||||
and not new.get("stub")
|
||||
}
|
||||
|
||||
# Any function with former or current "effective" match
|
||||
entropy_functions = {
|
||||
key: (saved, new)
|
||||
for key, (saved, new) in combined.items()
|
||||
if saved is not None
|
||||
and new is not None
|
||||
and new["matching"] == 1.0
|
||||
and saved["matching"] == 1.0
|
||||
and new.get("effective", False) != saved.get("effective", False)
|
||||
}
|
||||
|
||||
get_diff_str = diff_json_display(show_both_addrs, is_plain)
|
||||
|
||||
for diff_name, diff_dict in [
|
||||
("New", new_functions),
|
||||
("Increased", improved_functions),
|
||||
("Decreased", degraded_functions),
|
||||
("Dropped", dropped_functions),
|
||||
("Compiler entropy", entropy_functions),
|
||||
]:
|
||||
if len(diff_dict) == 0:
|
||||
continue
|
||||
|
||||
print(f"{diff_name} ({len(diff_dict)}):")
|
||||
|
||||
for addr, (saved, new) in diff_dict.items():
|
||||
print(get_diff_str(addr, saved, new))
|
||||
|
||||
print()
|
||||
|
||||
|
||||
def get_file_in_script_dir(fn):
|
||||
return os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), fn)
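A sketch of how a saved report and fresh data flow through `diff_json` (the data shapes follow the comprehensions above; the address and name are made up):

```python
saved = {
    "file": "lego1.dll",
    "data": [{"address": "0x100db710", "name": "MxCore::Tickle", "matching": 0.50}],
}
new = [{"address": "0x100db710", "name": "MxCore::Tickle", "matching": 0.75}]
diff_json(saved, new, "LEGO1.DLL", is_plain=True)
# Prints the function under "Increased (1):" as 50.00% -> 75.00%
```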
|
@ -1,11 +0,0 @@
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
setup(
|
||||
name="isledecomp",
|
||||
version="0.1.0",
|
||||
description="Python tools for the isledecomp project",
|
||||
packages=find_packages(),
|
||||
tests_require=["pytest"],
|
||||
include_package_data=True,
|
||||
package_data={"isledecomp.lib": ["*.exe", "*.dll"]},
|
||||
)
|
@ -1,3 +0,0 @@
|
||||
def pytest_addoption(parser):
|
||||
"""Allow the option to run tests against the original LEGO1.DLL."""
|
||||
parser.addoption("--lego1", action="store", help="Path to LEGO1.DLL")
|
@ -1,30 +0,0 @@
|
||||
// Sample for python unit tests
|
||||
// Not part of the decomp
|
||||
|
||||
// A very simple class
|
||||
|
||||
// VTABLE: TEST 0x1001002
|
||||
class TestClass {
|
||||
public:
|
||||
TestClass();
|
||||
virtual ~TestClass() override;
|
||||
|
||||
virtual MxResult Tickle() override; // vtable+08
|
||||
|
||||
// FUNCTION: TEST 0x12345678
|
||||
inline const char* ClassName() const // vtable+0c
|
||||
{
|
||||
// 0xabcd1234
|
||||
return "TestClass";
|
||||
}
|
||||
|
||||
// FUNCTION: TEST 0xdeadbeef
|
||||
inline MxBool IsA(const char* name) const override // vtable+10
|
||||
{
|
||||
return !strcmp(name, TestClass::ClassName());
|
||||
}
|
||||
|
||||
private:
|
||||
int m_hello;
|
||||
int m_hiThere;
|
||||
};
|
@ -1,22 +0,0 @@
|
||||
// Sample for python unit tests
|
||||
// Not part of the decomp
|
||||
|
||||
// A very simple well-formed code file
|
||||
|
||||
// FUNCTION: TEST 0x1234
|
||||
void function01()
|
||||
{
|
||||
// TODO
|
||||
}
|
||||
|
||||
// FUNCTION: TEST 0x2345
|
||||
void function02()
|
||||
{
|
||||
// TODO
|
||||
}
|
||||
|
||||
// FUNCTION: TEST 0x3456
|
||||
void function03()
|
||||
{
|
||||
// TODO
|
||||
}
|
@ -1,14 +0,0 @@
|
||||
// Sample for python unit tests
|
||||
// Not part of the decomp
|
||||
|
||||
// Global variables inside and outside of functions
|
||||
|
||||
// GLOBAL: TEST 0x1000
|
||||
const char *g_message = "test";
|
||||
|
||||
// FUNCTION: TEST 0x1234
|
||||
void function01()
|
||||
{
|
||||
// GLOBAL: TEST 0x5555
|
||||
static int g_hello = 123;
|
||||
}
|
@ -1,8 +0,0 @@
|
||||
// Sample for python unit tests
|
||||
// Not part of the decomp
|
||||
|
||||
// FUNCTION: TEST 0x10000001
|
||||
inline const char* OneLineWithComment() const { return "MxDSObject"; }; // hi there
|
||||
|
||||
// FUNCTION: TEST 0x10000002
|
||||
inline const char* OneLine() const { return "MxDSObject"; };
|
@ -1,16 +0,0 @@
|
||||
// Sample for python unit tests
|
||||
// Not part of the decomp
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
int no_offset_comment()
|
||||
{
|
||||
static int dummy = 123;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// FUNCTION: TEST 0xdeadbeef
|
||||
void regular_ole_function()
|
||||
{
|
||||
printf("hi there");
|
||||
}
|
@ -1,25 +0,0 @@
|
||||
// Sample for python unit tests
|
||||
// Not part of the decomp
|
||||
|
||||
// Handling multiple offset markers
|
||||
|
||||
// FUNCTION: TEST 0x1234
|
||||
// FUNCTION: HELLO 0x5555
|
||||
void different_modules()
|
||||
{
|
||||
// TODO
|
||||
}
|
||||
|
||||
// FUNCTION: TEST 0x2345
|
||||
// FUNCTION: TEST 0x1234
|
||||
void same_module()
|
||||
{
|
||||
// TODO
|
||||
}
|
||||
|
||||
// FUNCTION: TEST 0x2002
|
||||
// FUNCTION: test 0x1001
|
||||
void same_case_insensitive()
|
||||
{
|
||||
// TODO
|
||||
}
|
@ -1,12 +0,0 @@
|
||||
// Sample for python unit tests
|
||||
// Not part of the decomp
|
||||
|
||||
// FUNCTION: TEST 0x1234
|
||||
void short_function() { static char* msg = "oneliner"; }
|
||||
|
||||
// FUNCTION: TEST 0x5555
|
||||
void function_after_one_liner()
|
||||
{
|
||||
// This function comes after the previous that is on a single line.
|
||||
// Do we report the offset for this one correctly?
|
||||
}
|
@ -1,20 +0,0 @@
|
||||
// Sample for python unit tests
|
||||
// Not part of the decomp
|
||||
|
||||
// FUNCTION: TEST 0x1001
|
||||
void function_order01()
|
||||
{
|
||||
// TODO
|
||||
}
|
||||
|
||||
// FUNCTION: TEST 0x1003
|
||||
void function_order03()
|
||||
{
|
||||
// TODO
|
||||
}
|
||||
|
||||
// FUNCTION: TEST 0x1002
|
||||
void function_order02()
|
||||
{
|
||||
// TODO
|
||||
}
|
@ -1,23 +0,0 @@
|
||||
// Sample for python unit tests
|
||||
// Not part of the decomp
|
||||
|
||||
// While it's reasonable to expect a well-formed file (and clang-format
|
||||
// will make sure we get one), this will put the parser through its paces.
|
||||
|
||||
// FUNCTION: TEST 0x1234
|
||||
void curly_with_spaces()
|
||||
{
|
||||
static char* msg = "hello";
|
||||
}
|
||||
|
||||
// FUNCTION: TEST 0x5555
|
||||
void weird_closing_curly()
|
||||
{
|
||||
int x = 123; }
|
||||
|
||||
// FUNCTION: HELLO 0x5656
|
||||
void bad_indenting() {
|
||||
if (0)
|
||||
{
|
||||
int y = 5;
|
||||
}}
|
@ -1,82 +0,0 @@
|
||||
"""Testing compare database behavior, particularly matching"""
|
||||
import pytest
|
||||
from isledecomp.compare.db import CompareDb
|
||||
|
||||
|
||||
@pytest.fixture(name="db")
|
||||
def fixture_db():
|
||||
return CompareDb()
|
||||
|
||||
|
||||
def test_ignore_recomp_collision(db):
|
||||
"""Duplicate recomp addresses are ignored"""
|
||||
db.set_recomp_symbol(0x1234, None, "hello", None, 100)
|
||||
db.set_recomp_symbol(0x1234, None, "alias_for_hello", None, 100)
|
||||
syms = db.get_all()
|
||||
assert len(syms) == 1
|
||||
|
||||
|
||||
def test_orig_collision(db):
|
||||
"""Don't match if the original address is not unique"""
|
||||
db.set_recomp_symbol(0x1234, None, "hello", None, 100)
|
||||
assert db.match_function(0x5555, "hello") is True
|
||||
|
||||
# Second run on same address fails
|
||||
assert db.match_function(0x5555, "hello") is False
|
||||
|
||||
# Call set_pair directly without wrapper
|
||||
assert db.set_pair(0x5555, 0x1234) is False
|
||||
|
||||
|
||||
def test_name_match(db):
|
||||
db.set_recomp_symbol(0x1234, None, "hello", None, 100)
|
||||
assert db.match_function(0x5555, "hello") is True
|
||||
|
||||
match = db.get_by_orig(0x5555)
|
||||
assert match.name == "hello"
|
||||
assert match.recomp_addr == 0x1234
|
||||
|
||||
|
||||
def test_match_decorated(db):
|
||||
"""Should match using decorated name even though regular name is null"""
|
||||
db.set_recomp_symbol(0x1234, None, None, "?_hello", 100)
|
||||
assert db.match_function(0x5555, "?_hello") is True
|
||||
match = db.get_by_orig(0x5555)
|
||||
assert match is not None
|
||||
|
||||
|
||||
def test_duplicate_name(db):
|
||||
"""If recomp name is not unique, match only one row"""
|
||||
db.set_recomp_symbol(0x100, None, "_Construct", None, 100)
|
||||
db.set_recomp_symbol(0x200, None, "_Construct", None, 100)
|
||||
db.set_recomp_symbol(0x300, None, "_Construct", None, 100)
|
||||
db.match_function(0x5555, "_Construct")
|
||||
matches = db.get_matches()
|
||||
# We aren't testing _which_ one would be matched, just that only one _was_ matched
|
||||
assert len(matches) == 1
|
||||
|
||||
|
||||
def test_static_variable_match(db):
|
||||
"""Set up a situation where we can match a static function variable, then match it."""
|
||||
|
||||
# We need a matched function to start with.
|
||||
db.set_recomp_symbol(0x1234, None, "Isle::Tick", "?Tick@IsleApp@@QAEXH@Z", 100)
|
||||
db.match_function(0x5555, "Isle::Tick")
|
||||
|
||||
# Decorated variable name from PDB.
|
||||
db.set_recomp_symbol(
|
||||
0x2000, None, None, "?g_startupDelay@?1??Tick@IsleApp@@QAEXH@Z@4HA", 4
|
||||
)
|
||||
|
||||
# Provide variable name and orig function address from decomp markers
|
||||
assert db.match_static_variable(0xBEEF, "g_startupDelay", 0x5555) is True
|
||||
|
||||
|
||||
def test_match_options_bool(db):
|
||||
"""Test handling of boolean match options"""
|
||||
|
||||
# You don't actually need an existing orig addr for this.
|
||||
assert db.get_match_options(0x1234) == {}
|
||||
|
||||
db.mark_stub(0x1234)
|
||||
assert "stub" in db.get_match_options(0x1234)
|
@ -1,73 +0,0 @@
|
||||
# nyuk nyuk nyuk
|
||||
import pytest
|
||||
from isledecomp.parser.parser import CurlyManager
|
||||
from isledecomp.parser.util import sanitize_code_line
|
||||
|
||||
|
||||
@pytest.fixture(name="curly")
|
||||
def fixture_curly():
|
||||
return CurlyManager()
|
||||
|
||||
|
||||
def test_simple(curly):
|
||||
curly.read_line("namespace Test {")
|
||||
assert curly.get_prefix() == "Test"
|
||||
curly.read_line("}")
|
||||
assert curly.get_prefix() == ""
|
||||
|
||||
|
||||
def test_oneliner(curly):
|
||||
"""Should not go down into a scope for a class forward reference"""
|
||||
curly.read_line("class LegoEntity;")
|
||||
assert curly.get_prefix() == ""
|
||||
# Now make sure that we still would not consider that class name
|
||||
# even after reading the opening curly brace
|
||||
curly.read_line("if (true) {")
|
||||
assert curly.get_prefix() == ""
|
||||
|
||||
|
||||
def test_ignore_comments(curly):
|
||||
curly.read_line("namespace Test {")
|
||||
curly.read_line("// }")
|
||||
assert curly.get_prefix() == "Test"
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="todo: need a real lexer")
|
||||
def test_ignore_multiline_comments(curly):
|
||||
curly.read_line("namespace Test {")
|
||||
curly.read_line("/*")
|
||||
curly.read_line("}")
|
||||
curly.read_line("*/")
|
||||
assert curly.get_prefix() == "Test"
|
||||
curly.read_line("}")
|
||||
assert curly.get_prefix() == ""
|
||||
|
||||
|
||||
def test_nested(curly):
|
||||
curly.read_line("namespace Test {")
|
||||
curly.read_line("namespace Foo {")
|
||||
assert curly.get_prefix() == "Test::Foo"
|
||||
curly.read_line("}")
|
||||
assert curly.get_prefix() == "Test"
|
||||
|
||||
|
||||
sanitize_cases = [
|
||||
("", ""),
|
||||
(" ", ""),
|
||||
("{", "{"),
|
||||
("// comments {", ""),
|
||||
("{ // why comment here", "{"),
|
||||
("/* comments */ {", "{"),
|
||||
('"curly in a string {"', '""'),
|
||||
('if (!strcmp("hello { there }", g_test)) {', 'if (!strcmp("", g_test)) {'),
|
||||
("'{'", "''"),
|
||||
("weird_function('\"', hello, '\"')", "weird_function('', hello, '')"),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("start, end", sanitize_cases)
|
||||
def test_sanitize(start: str, end: str):
|
||||
"""Make sure that we can remove curly braces in places where they should
|
||||
not be considered as part of the semantic structure of the file.
|
||||
i.e. inside strings or chars, and inside comments"""
|
||||
assert sanitize_code_line(start) == end
|
@ -1,59 +0,0 @@
|
||||
import pytest
|
||||
from isledecomp.cvdump.types import (
|
||||
scalar_type_size,
|
||||
scalar_type_pointer,
|
||||
scalar_type_signed,
|
||||
)
|
||||
|
||||
# These are all the types seen in the cvdump.
|
||||
# We have char, short, int, long, and long long in both signed and unsigned
# variants, plus float and double.
|
||||
# We can also identify a 4 byte pointer with the T_32 prefix.
|
||||
# The type T_VOID is used to designate a function's return type.
|
||||
# T_NOTYPE is specified as the type of "this" for a static function in a class.
|
||||
|
||||
# For reference: https://github.com/microsoft/microsoft-pdb/blob/master/include/cvinfo.h
|
||||
|
||||
# fmt: off
|
||||
# Fields are: type_name, size, is_signed, is_pointer
|
||||
type_check_cases = (
|
||||
("T_32PINT4", 4, False, True),
|
||||
("T_32PLONG", 4, False, True),
|
||||
("T_32PRCHAR", 4, False, True),
|
||||
("T_32PREAL32", 4, False, True),
|
||||
("T_32PUCHAR", 4, False, True),
|
||||
("T_32PUINT4", 4, False, True),
|
||||
("T_32PULONG", 4, False, True),
|
||||
("T_32PUSHORT", 4, False, True),
|
||||
("T_32PVOID", 4, False, True),
|
||||
("T_CHAR", 1, True, False),
|
||||
("T_INT4", 4, True, False),
|
||||
("T_LONG", 4, True, False),
|
||||
("T_QUAD", 8, True, False),
|
||||
("T_RCHAR", 1, True, False),
|
||||
("T_REAL32", 4, True, False),
|
||||
("T_REAL64", 8, True, False),
|
||||
("T_SHORT", 2, True, False),
|
||||
("T_UCHAR", 1, False, False),
|
||||
("T_UINT4", 4, False, False),
|
||||
("T_ULONG", 4, False, False),
|
||||
("T_UQUAD", 8, False, False),
|
||||
("T_USHORT", 2, False, False),
|
||||
("T_WCHAR", 2, False, False),
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
|
||||
@pytest.mark.parametrize("type_name, size, _, __", type_check_cases)
|
||||
def test_scalar_size(type_name: str, size: int, _, __):
|
||||
assert scalar_type_size(type_name) == size
|
||||
|
||||
|
||||
@pytest.mark.parametrize("type_name, _, is_signed, __", type_check_cases)
|
||||
def test_scalar_signed(type_name: str, _, is_signed: bool, __):
|
||||
assert scalar_type_signed(type_name) == is_signed
|
||||
|
||||
|
||||
@pytest.mark.parametrize("type_name, _, __, is_pointer", type_check_cases)
|
||||
def test_scalar_pointer(type_name: str, _, __, is_pointer: bool):
|
||||
assert scalar_type_pointer(type_name) == is_pointer
|
@ -1,38 +0,0 @@
|
||||
"""Test Cvdump SYMBOLS parser, reading function stack/params"""
|
||||
|
||||
from isledecomp.cvdump.symbols import CvdumpSymbolsParser
|
||||
|
||||
PROC_WITH_BLOC = """
|
||||
(000638) S_GPROC32: [0001:000C6135], Cb: 00000361, Type: 0x10ED, RegistrationBook::ReadyWorld
|
||||
Parent: 00000000, End: 00000760, Next: 00000000
|
||||
Debug start: 0000000C, Debug end: 0000035C
|
||||
Flags: Frame Ptr Present
|
||||
(00067C) S_BPREL32: [FFFFFFD0], Type: 0x10EC, this
|
||||
(000690) S_BPREL32: [FFFFFFDC], Type: 0x10F5, checkmarkBuffer
|
||||
(0006AC) S_BPREL32: [FFFFFFE8], Type: 0x10F6, letterBuffer
|
||||
(0006C8) S_BPREL32: [FFFFFFF4], Type: T_SHORT(0011), i
|
||||
(0006D8) S_BPREL32: [FFFFFFF8], Type: 0x10F8, players
|
||||
(0006EC) S_BPREL32: [FFFFFFFC], Type: 0x1044, gameState
|
||||
(000704) S_BLOCK32: [0001:000C624F], Cb: 000001DA,
|
||||
Parent: 00000638, End: 0000072C
|
||||
(00071C) S_BPREL32: [FFFFFFD8], Type: T_SHORT(0011), j
|
||||
(00072C) S_END
|
||||
(000730) S_BLOCK32: [0001:000C6448], Cb: 00000032,
|
||||
Parent: 00000638, End: 0000075C
|
||||
(000748) S_BPREL32: [FFFFFFD4], Type: 0x10FA, infoman
|
||||
(00075C) S_END
|
||||
(000760) S_END
|
||||
"""
|
||||
|
||||
|
||||
def test_sblock32():
|
||||
"""S_END has double duty as marking the end of a function (S_GPROC32)
|
||||
and a scope block (S_BLOCK32). Make sure we can distinguish between
|
||||
the two and not end a function early."""
|
||||
parser = CvdumpSymbolsParser()
|
||||
for line in PROC_WITH_BLOC.split("\n"):
|
||||
parser.read_line(line)
|
||||
|
||||
# Make sure we can read the proc and all its stack references
|
||||
assert len(parser.symbols) == 1
|
||||
assert len(parser.symbols[0].stack_symbols) == 8
|
@ -1,705 +0,0 @@
"""Specifically testing the Cvdump TYPES parser
and type dependency tree walker."""

import pytest
from isledecomp.cvdump.types import (
    CvdumpTypesParser,
    CvdumpKeyError,
    CvdumpIntegrityError,
    FieldListItem,
    VirtualBaseClass,
    VirtualBasePointer,
)

TEST_LINES = """
0x1018 : Length = 18, Leaf = 0x1201 LF_ARGLIST argument count = 3
list[0] = 0x100D
list[1] = 0x1016
list[2] = 0x1017

0x1019 : Length = 14, Leaf = 0x1008 LF_PROCEDURE
Return type = T_LONG(0012), Call type = C Near
Func attr = none
# Parms = 3, Arg list type = 0x1018

0x101e : Length = 26, Leaf = 0x1009 LF_MFUNCTION
Return type = T_CHAR(0010), Class type = 0x101A, This type = 0x101B,
Call type = ThisCall, Func attr = none
Parms = 2, Arg list type = 0x101d, This adjust = 0

0x1028 : Length = 10, Leaf = 0x1001 LF_MODIFIER
const, modifies type T_REAL32(0040)

0x103b : Length = 14, Leaf = 0x1503 LF_ARRAY
Element type = T_REAL32(0040)
Index type = T_SHORT(0011)
length = 16
Name =

0x103c : Length = 14, Leaf = 0x1503 LF_ARRAY
Element type = 0x103B
Index type = T_SHORT(0011)
length = 64
Name =

0x10e0 : Length = 86, Leaf = 0x1203 LF_FIELDLIST
list[0] = LF_MEMBER, public, type = T_REAL32(0040), offset = 0
member name = 'x'
list[1] = LF_MEMBER, public, type = T_REAL32(0040), offset = 0
member name = 'dvX'
list[2] = LF_MEMBER, public, type = T_REAL32(0040), offset = 4
member name = 'y'
list[3] = LF_MEMBER, public, type = T_REAL32(0040), offset = 4
member name = 'dvY'
list[4] = LF_MEMBER, public, type = T_REAL32(0040), offset = 8
member name = 'z'
list[5] = LF_MEMBER, public, type = T_REAL32(0040), offset = 8
member name = 'dvZ'

0x10e1 : Length = 34, Leaf = 0x1505 LF_STRUCTURE
# members = 6, field list type 0x10e0,
Derivation list type 0x0000, VT shape type 0x0000
Size = 12, class name = _D3DVECTOR, UDT(0x000010e1)

0x10e4 : Length = 14, Leaf = 0x1503 LF_ARRAY
Element type = T_UCHAR(0020)
Index type = T_SHORT(0011)
length = 8
Name =

0x10ea : Length = 14, Leaf = 0x1503 LF_ARRAY
Element type = 0x1028
Index type = T_SHORT(0011)
length = 12
Name =

0x11f0 : Length = 30, Leaf = 0x1504 LF_CLASS
# members = 0, field list type 0x0000, FORWARD REF,
Derivation list type 0x0000, VT shape type 0x0000
Size = 0, class name = MxRect32, UDT(0x00001214)

0x11f2 : Length = 10, Leaf = 0x1001 LF_MODIFIER
const, modifies type 0x11F0

0x1213 : Length = 530, Leaf = 0x1203 LF_FIELDLIST
list[0] = LF_METHOD, count = 5, list = 0x1203, name = 'MxRect32'
list[1] = LF_ONEMETHOD, public, VANILLA, index = 0x1205, name = 'operator='
list[2] = LF_ONEMETHOD, public, VANILLA, index = 0x11F5, name = 'Intersect'
list[3] = LF_ONEMETHOD, public, VANILLA, index = 0x1207, name = 'SetPoint'
list[4] = LF_ONEMETHOD, public, VANILLA, index = 0x1207, name = 'AddPoint'
list[5] = LF_ONEMETHOD, public, VANILLA, index = 0x1207, name = 'SubtractPoint'
list[6] = LF_ONEMETHOD, public, VANILLA, index = 0x11F5, name = 'UpdateBounds'
list[7] = LF_ONEMETHOD, public, VANILLA, index = 0x1209, name = 'IsValid'
list[8] = LF_ONEMETHOD, public, VANILLA, index = 0x120A, name = 'IntersectsWith'
list[9] = LF_ONEMETHOD, public, VANILLA, index = 0x120B, name = 'GetWidth'
list[10] = LF_ONEMETHOD, public, VANILLA, index = 0x120B, name = 'GetHeight'
list[11] = LF_ONEMETHOD, public, VANILLA, index = 0x120C, name = 'GetPoint'
list[12] = LF_ONEMETHOD, public, VANILLA, index = 0x120D, name = 'GetSize'
list[13] = LF_ONEMETHOD, public, VANILLA, index = 0x120B, name = 'GetLeft'
list[14] = LF_ONEMETHOD, public, VANILLA, index = 0x120B, name = 'GetTop'
list[15] = LF_ONEMETHOD, public, VANILLA, index = 0x120B, name = 'GetRight'
list[16] = LF_ONEMETHOD, public, VANILLA, index = 0x120B, name = 'GetBottom'
list[17] = LF_ONEMETHOD, public, VANILLA, index = 0x120E, name = 'SetLeft'
list[18] = LF_ONEMETHOD, public, VANILLA, index = 0x120E, name = 'SetTop'
list[19] = LF_ONEMETHOD, public, VANILLA, index = 0x120E, name = 'SetRight'
list[20] = LF_ONEMETHOD, public, VANILLA, index = 0x120E, name = 'SetBottom'
list[21] = LF_METHOD, count = 3, list = 0x1211, name = 'CopyFrom'
list[22] = LF_ONEMETHOD, private, STATIC, index = 0x1212, name = 'Min'
list[23] = LF_ONEMETHOD, private, STATIC, index = 0x1212, name = 'Max'
list[24] = LF_MEMBER, private, type = T_INT4(0074), offset = 0
member name = 'm_left'
list[25] = LF_MEMBER, private, type = T_INT4(0074), offset = 4
member name = 'm_top'
list[26] = LF_MEMBER, private, type = T_INT4(0074), offset = 8
member name = 'm_right'
list[27] = LF_MEMBER, private, type = T_INT4(0074), offset = 12
member name = 'm_bottom'

0x1214 : Length = 30, Leaf = 0x1504 LF_CLASS
# members = 34, field list type 0x1213, CONSTRUCTOR, OVERLOAD,
Derivation list type 0x0000, VT shape type 0x0000
Size = 16, class name = MxRect32, UDT(0x00001214)

0x1220 : Length = 30, Leaf = 0x1504 LF_CLASS
# members = 0, field list type 0x0000, FORWARD REF,
Derivation list type 0x0000, VT shape type 0x0000
Size = 0, class name = MxCore, UDT(0x00004060)

0x14db : Length = 30, Leaf = 0x1504 LF_CLASS
# members = 0, field list type 0x0000, FORWARD REF,
Derivation list type 0x0000, VT shape type 0x0000
Size = 0, class name = MxString, UDT(0x00004db6)

0x19b0 : Length = 34, Leaf = 0x1505 LF_STRUCTURE
# members = 0, field list type 0x0000, FORWARD REF,
Derivation list type 0x0000, VT shape type 0x0000
Size = 0, class name = ROIColorAlias, UDT(0x00002a76)

0x19b1 : Length = 14, Leaf = 0x1503 LF_ARRAY
Element type = 0x19B0
Index type = T_SHORT(0011)
length = 440
Name =

0x2339 : Length = 26, Leaf = 0x1506 LF_UNION
# members = 0, field list type 0x0000, FORWARD REF, Size = 0 ,class name = FlagBitfield, UDT(0x00002e85)

0x2e85 : Length = 26, Leaf = 0x1506 LF_UNION
# members = 8, field list type 0x2e84, Size = 1 ,class name = FlagBitfield, UDT(0x00002e85)

0x2a75 : Length = 98, Leaf = 0x1203 LF_FIELDLIST
list[0] = LF_MEMBER, public, type = T_32PRCHAR(0470), offset = 0
member name = 'm_name'
list[1] = LF_MEMBER, public, type = T_INT4(0074), offset = 4
member name = 'm_red'
list[2] = LF_MEMBER, public, type = T_INT4(0074), offset = 8
member name = 'm_green'
list[3] = LF_MEMBER, public, type = T_INT4(0074), offset = 12
member name = 'm_blue'
list[4] = LF_MEMBER, public, type = T_INT4(0074), offset = 16
member name = 'm_unk0x10'

0x2a76 : Length = 34, Leaf = 0x1505 LF_STRUCTURE
# members = 5, field list type 0x2a75,
Derivation list type 0x0000, VT shape type 0x0000
Size = 20, class name = ROIColorAlias, UDT(0x00002a76)

0x22d4 : Length = 154, Leaf = 0x1203 LF_FIELDLIST
list[0] = LF_VFUNCTAB, type = 0x20FC
list[1] = LF_METHOD, count = 3, list = 0x22D0, name = 'MxVariable'
list[2] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1F0F,
vfptr offset = 0, name = 'GetValue'
list[3] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1F10,
vfptr offset = 4, name = 'SetValue'
list[4] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1F11,
vfptr offset = 8, name = '~MxVariable'
list[5] = LF_ONEMETHOD, public, VANILLA, index = 0x22D3, name = 'GetKey'
list[6] = LF_MEMBER, protected, type = 0x14DB, offset = 4
member name = 'm_key'
list[7] = LF_MEMBER, protected, type = 0x14DB, offset = 20
member name = 'm_value'

0x22d5 : Length = 34, Leaf = 0x1504 LF_CLASS
# members = 10, field list type 0x22d4, CONSTRUCTOR,
Derivation list type 0x0000, VT shape type 0x20fb
Size = 36, class name = MxVariable, UDT(0x00004041)

0x3c45 : Length = 50, Leaf = 0x1203 LF_FIELDLIST
list[0] = LF_ENUMERATE, public, value = 1, name = 'c_read'
list[1] = LF_ENUMERATE, public, value = 2, name = 'c_write'
list[2] = LF_ENUMERATE, public, value = 4, name = 'c_text'

0x3cc2 : Length = 38, Leaf = 0x1507 LF_ENUM
# members = 64, type = T_INT4(0074) field list type 0x3cc1
NESTED, enum name = JukeBox::JukeBoxScript, UDT(0x00003cc2)

0x3fab : Length = 10, Leaf = 0x1002 LF_POINTER
Pointer (NEAR32), Size: 0
Element type : 0x3FAA

0x405f : Length = 158, Leaf = 0x1203 LF_FIELDLIST
list[0] = LF_VFUNCTAB, type = 0x2090
list[1] = LF_ONEMETHOD, public, VANILLA, index = 0x176A, name = 'MxCore'
list[2] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x176A,
vfptr offset = 0, name = '~MxCore'
list[3] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x176B,
vfptr offset = 4, name = 'Notify'
list[4] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x2087,
vfptr offset = 8, name = 'Tickle'
list[5] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x202F,
vfptr offset = 12, name = 'ClassName'
list[6] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x2030,
vfptr offset = 16, name = 'IsA'
list[7] = LF_ONEMETHOD, public, VANILLA, index = 0x2091, name = 'GetId'
list[8] = LF_MEMBER, private, type = T_UINT4(0075), offset = 4
member name = 'm_id'

0x4060 : Length = 30, Leaf = 0x1504 LF_CLASS
# members = 9, field list type 0x405f, CONSTRUCTOR,
Derivation list type 0x0000, VT shape type 0x1266
Size = 8, class name = MxCore, UDT(0x00004060)

0x4262 : Length = 14, Leaf = 0x1503 LF_ARRAY
Element type = 0x3CC2
Index type = T_SHORT(0011)
length = 24
Name =

0x432f : Length = 14, Leaf = 0x1503 LF_ARRAY
Element type = T_INT4(0074)
Index type = T_SHORT(0011)
length = 12
Name =

0x4db5 : Length = 246, Leaf = 0x1203 LF_FIELDLIST
list[0] = LF_BCLASS, public, type = 0x1220, offset = 0
list[1] = LF_METHOD, count = 3, list = 0x14E3, name = 'MxString'
list[2] = LF_ONEMETHOD, public, VIRTUAL, index = 0x14DE, name = '~MxString'
list[3] = LF_METHOD, count = 2, list = 0x14E7, name = 'operator='
list[4] = LF_ONEMETHOD, public, VANILLA, index = 0x14DE, name = 'ToUpperCase'
list[5] = LF_ONEMETHOD, public, VANILLA, index = 0x14DE, name = 'ToLowerCase'
list[6] = LF_ONEMETHOD, public, VANILLA, index = 0x14E8, name = 'operator+'
list[7] = LF_ONEMETHOD, public, VANILLA, index = 0x14E9, name = 'operator+='
list[8] = LF_ONEMETHOD, public, VANILLA, index = 0x14EB, name = 'Compare'
list[9] = LF_ONEMETHOD, public, VANILLA, index = 0x14EC, name = 'GetData'
list[10] = LF_ONEMETHOD, public, VANILLA, index = 0x4DB4, name = 'GetLength'
list[11] = LF_MEMBER, private, type = T_32PRCHAR(0470), offset = 8
member name = 'm_data'
list[12] = LF_MEMBER, private, type = T_USHORT(0021), offset = 12
member name = 'm_length'


0x4dee : Length = 406, Leaf = 0x1203 LF_FIELDLIST
list[0] = LF_VBCLASS, public, direct base type = 0x15EA
virtual base ptr = 0x43E9, vbpoff = 4, vbind = 3
list[1] = LF_IVBCLASS, public, indirect base type = 0x1183
virtual base ptr = 0x43E9, vbpoff = 4, vbind = 1
list[2] = LF_IVBCLASS, public, indirect base type = 0x1468
virtual base ptr = 0x43E9, vbpoff = 4, vbind = 2
list[3] = LF_VFUNCTAB, type = 0x2B95
list[4] = LF_ONEMETHOD, public, VANILLA, index = 0x15C2, name = 'LegoRaceMap'
list[5] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15C3, name = '~LegoRaceMap'
list[6] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15C5, name = 'Notify'
list[7] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15C4, name = 'ParseAction'
list[8] = LF_ONEMETHOD, public, VIRTUAL, index = 0x4DED, name = 'VTable0x70'
list[9] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x15C2,
vfptr offset = 0, name = 'FUN_1005d4b0'
list[10] = LF_MEMBER, private, type = T_UCHAR(0020), offset = 8
member name = 'm_parentClass2Field1'
list[11] = LF_MEMBER, private, type = T_32PVOID(0403), offset = 12
member name = 'm_parentClass2Field2'

0x4def : Length = 34, Leaf = 0x1504 LF_CLASS
# members = 21, field list type 0x4dee, CONSTRUCTOR,
Derivation list type 0x0000, VT shape type 0x12a0
Size = 436, class name = LegoRaceMap, UDT(0x00004def)

0x4db6 : Length = 30, Leaf = 0x1504 LF_CLASS
# members = 16, field list type 0x4db5, CONSTRUCTOR, OVERLOAD,
Derivation list type 0x0000, VT shape type 0x1266
Size = 16, class name = MxString, UDT(0x00004db6)

0x5591 : Length = 570, Leaf = 0x1203 LF_FIELDLIST
list[0] = LF_VBCLASS, public, direct base type = 0x15EA
virtual base ptr = 0x43E9, vbpoff = 4, vbind = 3
list[1] = LF_IVBCLASS, public, indirect base type = 0x1183
virtual base ptr = 0x43E9, vbpoff = 4, vbind = 1
list[2] = LF_IVBCLASS, public, indirect base type = 0x1468
virtual base ptr = 0x43E9, vbpoff = 4, vbind = 2
list[3] = LF_VFUNCTAB, type = 0x4E11
list[4] = LF_ONEMETHOD, public, VANILLA, index = 0x1ABD, name = 'LegoCarRaceActor'
list[5] = LF_ONEMETHOD, public, VIRTUAL, index = 0x1AE0, name = 'ClassName'
list[6] = LF_ONEMETHOD, public, VIRTUAL, index = 0x1AE1, name = 'IsA'
list[7] = LF_ONEMETHOD, public, VIRTUAL, index = 0x1ADD, name = 'VTable0x6c'
list[8] = LF_ONEMETHOD, public, VIRTUAL, index = 0x1ADB, name = 'VTable0x70'
list[9] = LF_ONEMETHOD, public, VIRTUAL, index = 0x1ADA, name = 'SwitchBoundary'
list[10] = LF_ONEMETHOD, public, VIRTUAL, index = 0x1ADC, name = 'VTable0x9c'
list[11] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x558E,
vfptr offset = 0, name = 'FUN_10080590'
list[12] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1AD8,
vfptr offset = 4, name = 'FUN_10012bb0'
list[13] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1AD9,
vfptr offset = 8, name = 'FUN_10012bc0'
list[14] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1AD8,
vfptr offset = 12, name = 'FUN_10012bd0'
list[15] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1AD9,
vfptr offset = 16, name = 'FUN_10012be0'
list[16] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1AD8,
vfptr offset = 20, name = 'FUN_10012bf0'
list[17] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1AD9,
vfptr offset = 24, name = 'FUN_10012c00'
list[18] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1ABD,
vfptr offset = 28, name = 'VTable0x1c'
list[19] = LF_MEMBER, protected, type = T_REAL32(0040), offset = 8
member name = 'm_parentClass1Field1'
list[25] = LF_ONEMETHOD, public, VIRTUAL, (compgenx), index = 0x15D1, name = '~LegoCarRaceActor'

0x5592 : Length = 38, Leaf = 0x1504 LF_CLASS
# members = 26, field list type 0x5591, CONSTRUCTOR,
Derivation list type 0x0000, VT shape type 0x34c7
Size = 416, class name = LegoCarRaceActor, UDT(0x00005592)

0x5593 : Length = 638, Leaf = 0x1203 LF_FIELDLIST
list[0] = LF_BCLASS, public, type = 0x5592, offset = 0
list[1] = LF_BCLASS, public, type = 0x4DEF, offset = 32
list[2] = LF_IVBCLASS, public, indirect base type = 0x1183
virtual base ptr = 0x43E9, vbpoff = 4, vbind = 1
list[3] = LF_IVBCLASS, public, indirect base type = 0x1468
virtual base ptr = 0x43E9, vbpoff = 4, vbind = 2
list[4] = LF_IVBCLASS, public, indirect base type = 0x15EA
virtual base ptr = 0x43E9, vbpoff = 4, vbind = 3
list[5] = LF_ONEMETHOD, public, VANILLA, index = 0x15CD, name = 'LegoRaceCar'
list[6] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15CE, name = '~LegoRaceCar'
list[7] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15D2, name = 'Notify'
list[8] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15E8, name = 'ClassName'
list[9] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15E9, name = 'IsA'
list[10] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15D5, name = 'ParseAction'
list[11] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15D3, name = 'SetWorldSpeed'
list[12] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15DF, name = 'VTable0x6c'
list[13] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15D3, name = 'VTable0x70'
list[14] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15DC, name = 'VTable0x94'
list[15] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15E5, name = 'SwitchBoundary'
list[16] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15DD, name = 'VTable0x9c'
list[17] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x15D4,
vfptr offset = 32, name = 'SetMaxLinearVelocity'
list[18] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x15D4,
vfptr offset = 36, name = 'FUN_10012ff0'
list[19] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x5588,
vfptr offset = 40, name = 'HandleSkeletonKicks'
list[20] = LF_MEMBER, private, type = T_UCHAR(0020), offset = 84
member name = 'm_childClassField'

0x5594 : Length = 34, Leaf = 0x1504 LF_CLASS
# members = 30, field list type 0x5593, CONSTRUCTOR,
Derivation list type 0x0000, VT shape type 0x2d1e
Size = 512, class name = LegoRaceCar, UDT(0x000055bb)
"""


@pytest.fixture(name="parser")
def types_parser_fixture():
    parser = CvdumpTypesParser()
    for line in TEST_LINES.split("\n"):
        parser.read_line(line)

    return parser


def test_basic_parsing(parser: CvdumpTypesParser):
    obj = parser.keys["0x4db6"]
    assert obj["type"] == "LF_CLASS"
    assert obj["name"] == "MxString"
    assert obj["udt"] == "0x4db6"

    assert len(parser.keys["0x4db5"]["members"]) == 2


def test_scalar_types(parser: CvdumpTypesParser):
    """Full tests on the scalar_* methods are in another file.
    Here we are just testing the passthrough of the "T_" types."""
    assert parser.get("T_CHAR").name is None
    assert parser.get("T_CHAR").size == 1

    assert parser.get("T_32PVOID").name is None
    assert parser.get("T_32PVOID").size == 4


def test_resolve_forward_ref(parser: CvdumpTypesParser):
    # Non-forward ref
    assert parser.get("0x22d5").name == "MxVariable"
    # Forward ref
    assert parser.get("0x14db").name == "MxString"
    assert parser.get("0x14db").size == 16


def test_members(parser: CvdumpTypesParser):
    """Return the list of items to compare for a given complex type.
    If the class has a superclass, add those members too."""
    # MxCore field list
    mxcore_members = parser.get_scalars("0x405f")
    assert mxcore_members == [
        (0, "vftable", "T_32PVOID"),
        (4, "m_id", "T_UINT4"),
    ]

    # MxCore class id. Should be the same members
    assert mxcore_members == parser.get_scalars("0x4060")

    # MxString field list. Should add inherited members from MxCore
    assert parser.get_scalars("0x4db5") == [
        (0, "vftable", "T_32PVOID"),
        (4, "m_id", "T_UINT4"),
        (8, "m_data", "T_32PRCHAR"),
        (12, "m_length", "T_USHORT"),
    ]

    # LegoRaceCar with multiple superclasses
    assert parser.get("0x5594").members == [
        FieldListItem(offset=0, name="vftable", type="T_32PVOID"),
        FieldListItem(offset=0, name="vftable", type="T_32PVOID"),
        FieldListItem(offset=8, name="m_parentClass1Field1", type="T_REAL32"),
        FieldListItem(offset=8, name="m_parentClass2Field1", type="T_UCHAR"),
        FieldListItem(offset=12, name="m_parentClass2Field2", type="T_32PVOID"),
        FieldListItem(offset=84, name="m_childClassField", type="T_UCHAR"),
    ]


def test_virtual_base_classes(parser: CvdumpTypesParser):
    """Make sure that virtual base classes are parsed correctly."""

    lego_car_race_actor = parser.keys.get("0x5591")
    assert lego_car_race_actor is not None
    assert lego_car_race_actor["vbase"] == VirtualBasePointer(
        vboffset=4,
        bases=[
            VirtualBaseClass(type="0x1183", index=1, direct=False),
            VirtualBaseClass(type="0x1468", index=2, direct=False),
            VirtualBaseClass(type="0x15EA", index=3, direct=True),
        ],
    )


def test_members_recursive(parser: CvdumpTypesParser):
    """Make sure that we unwrap the dependency tree correctly."""
    # MxVariable field list
    assert parser.get_scalars("0x22d4") == [
        (0, "vftable", "T_32PVOID"),
        (4, "m_key.vftable", "T_32PVOID"),
        (8, "m_key.m_id", "T_UINT4"),
        (12, "m_key.m_data", "T_32PRCHAR"),
        (16, "m_key.m_length", "T_USHORT"),  # with padding
        (20, "m_value.vftable", "T_32PVOID"),
        (24, "m_value.m_id", "T_UINT4"),
        (28, "m_value.m_data", "T_32PRCHAR"),
        (32, "m_value.m_length", "T_USHORT"),  # with padding
    ]


def test_struct(parser: CvdumpTypesParser):
    """Basic test for converting type into struct.unpack format string."""
    # MxCore: vftable and uint32. The vftable pointer is read as uint32.
    assert parser.get_format_string("0x4060") == "<LL"

    # _D3DVECTOR, three floats. Union types should already be removed.
    assert parser.get_format_string("0x10e1") == "<fff"

    # MxRect32, four signed ints.
    assert parser.get_format_string("0x1214") == "<llll"


def test_struct_padding(parser: CvdumpTypesParser):
    """For data comparison purposes, make sure we have no gaps in the
    list of scalar types. Any gap is filled by an unsigned char."""

    # MxString, padded to 16 bytes. 4 actual members. 2 bytes of padding.
    assert len(parser.get_scalars("0x4db6")) == 4
    assert len(parser.get_scalars_gapless("0x4db6")) == 6

    # MxVariable, with two MxStrings (and a vtable)
    # Fill in the middle gap and the outer gap.
    assert len(parser.get_scalars("0x22d5")) == 9
    assert len(parser.get_scalars_gapless("0x22d5")) == 13


def test_struct_format_string(parser: CvdumpTypesParser):
    """Generate the struct.unpack format string using the
    list of scalars with padding filled in."""
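    # struct notation: "<" = little-endian, L = u32, H = u16; the padding
    # bytes from get_scalars_gapless surface as B (unsigned char).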
    # MxString, padded to 16 bytes.
    assert parser.get_format_string("0x4db6") == "<LLLHBB"

    # MxVariable, with two MxString members.
    assert parser.get_format_string("0x22d5") == "<LLLLHBBLLLHBB"


def test_array(parser: CvdumpTypesParser):
    """LF_ARRAY members are created dynamically based on the
    total array size and the size of one element."""
    # unsigned char[8]
    assert parser.get_scalars("0x10e4") == [
        (0, "[0]", "T_UCHAR"),
        (1, "[1]", "T_UCHAR"),
        (2, "[2]", "T_UCHAR"),
        (3, "[3]", "T_UCHAR"),
        (4, "[4]", "T_UCHAR"),
        (5, "[5]", "T_UCHAR"),
        (6, "[6]", "T_UCHAR"),
        (7, "[7]", "T_UCHAR"),
    ]

    # float[4]
    assert parser.get_scalars("0x103b") == [
        (0, "[0]", "T_REAL32"),
        (4, "[1]", "T_REAL32"),
        (8, "[2]", "T_REAL32"),
        (12, "[3]", "T_REAL32"),
    ]


def test_2d_array(parser: CvdumpTypesParser):
    """Make sure 2d array elements are named as we expect."""
    # float[4][4]
    float_array = parser.get_scalars("0x103c")
    assert len(float_array) == 16
    assert float_array[0] == (0, "[0][0]", "T_REAL32")
    assert float_array[1] == (4, "[0][1]", "T_REAL32")
    assert float_array[4] == (16, "[1][0]", "T_REAL32")
    assert float_array[-1] == (60, "[3][3]", "T_REAL32")


def test_enum(parser: CvdumpTypesParser):
    """LF_ENUM should equal 4-byte int"""
    assert parser.get("0x3cc2").size == 4
    assert parser.get_scalars("0x3cc2") == [(0, None, "T_INT4")]

    # Now look at an array of enum, 24 bytes
    enum_array = parser.get_scalars("0x4262")
    assert len(enum_array) == 6  # 24 / 4
    assert enum_array[0].size == 4


def test_lf_pointer(parser: CvdumpTypesParser):
    """LF_POINTER is just a wrapper for scalar pointer type"""
    assert parser.get("0x3fab").size == 4
    # assert parser.get("0x3fab").is_pointer is True  # TODO: ?

    assert parser.get_scalars("0x3fab") == [(0, None, "T_32PVOID")]


def test_key_not_exist(parser: CvdumpTypesParser):
    """Accessing a non-existent type id should raise our exception"""
    with pytest.raises(CvdumpKeyError):
        parser.get("0xbeef")

    with pytest.raises(CvdumpKeyError):
        parser.get_scalars("0xbeef")


def test_broken_forward_ref(parser: CvdumpTypesParser):
    """Raise an exception if we cannot follow a forward reference"""
    # Verify forward reference on MxCore
    parser.get("0x1220")

    # Delete the MxCore LF_CLASS
    del parser.keys["0x4060"]

    # Forward ref via 0x1220 will fail
    with pytest.raises(CvdumpKeyError):
        parser.get("0x1220")


def test_null_forward_ref(parser: CvdumpTypesParser):
    """If the forward ref object is invalid and has no forward ref id,
    raise an exception."""
    # Test MxString forward reference
    parser.get("0x14db")

    # Delete the UDT for MxString
    del parser.keys["0x14db"]["udt"]

    # Cannot complete the forward reference lookup
    with pytest.raises(CvdumpIntegrityError):
        parser.get("0x14db")


def test_broken_array_element_ref(parser: CvdumpTypesParser):
    # Test LF_ARRAY of ROIColorAlias
    parser.get("0x19b1")

    # Delete ROIColorAlias
    del parser.keys["0x19b0"]

    # Type reference lookup will fail
    with pytest.raises(CvdumpKeyError):
        parser.get("0x19b1")


def test_lf_modifier(parser: CvdumpTypesParser):
    """Is this an alias for another type?"""
    # Modifies float
    assert parser.get("0x1028").size == 4
    assert parser.get_scalars("0x1028") == [(0, None, "T_REAL32")]

    mxrect = parser.get_scalars("0x1214")
    # Modifies MxRect32 via forward ref
    assert mxrect == parser.get_scalars("0x11f2")


def test_union_members(parser: CvdumpTypesParser):
    """If there is a union somewhere in our dependency list, we can
    expect to see duplicated member offsets and names. This is ok for
    the TypeInfo tuple, but the list of ScalarType items should have
    unique offset to simplify comparison."""

    # D3DVector type with duplicated offsets
    d3dvector = parser.get("0x10e1")
    assert d3dvector.members is not None
    assert len(d3dvector.members) == 6
    assert len([m for m in d3dvector.members if m.offset == 0]) == 2

    # Deduplicated comparison list
    vector_items = parser.get_scalars("0x10e1")
    assert len(vector_items) == 3


def test_arglist(parser: CvdumpTypesParser):
    arglist = parser.keys["0x1018"]
    assert arglist["argcount"] == 3
    assert arglist["args"] == ["0x100D", "0x1016", "0x1017"]


def test_procedure(parser: CvdumpTypesParser):
    procedure = parser.keys["0x1019"]
    assert procedure == {
        "type": "LF_PROCEDURE",
        "return_type": "T_LONG(0012)",
        "call_type": "C Near",
        "func_attr": "none",
        "num_params": "3",
        "arg_list_type": "0x1018",
    }


def test_mfunction(parser: CvdumpTypesParser):
    mfunction = parser.keys["0x101e"]
    assert mfunction == {
        "type": "LF_MFUNCTION",
        "return_type": "T_CHAR(0010)",
        "class_type": "0x101A",
        "this_type": "0x101B",
        "call_type": "ThisCall",
        "func_attr": "none",
        "num_params": "2",
        "arg_list_type": "0x101d",
        "this_adjust": "0",
    }


def test_union_forward_ref(parser: CvdumpTypesParser):
    union = parser.keys["0x2339"]
    assert union["is_forward_ref"] is True
    assert union["udt"] == "0x2e85"


def test_union(parser: CvdumpTypesParser):
    union = parser.keys["0x2e85"]
    assert union == {
        "type": "LF_UNION",
        "name": "FlagBitfield",
        "size": 1,
        "udt": "0x2e85",
    }


def test_fieldlist_enumerate(parser: CvdumpTypesParser):
    fieldlist_enum = parser.keys["0x3c45"]
    assert fieldlist_enum == {
        "type": "LF_FIELDLIST",
        "variants": [
            {"name": "c_read", "value": 1},
            {"name": "c_write", "value": 2},
            {"name": "c_text", "value": 4},
        ],
    }


UNNAMED_UNION_DATA = """
0x369d : Length = 34, Leaf = 0x1203 LF_FIELDLIST
list[0] = LF_MEMBER, public, type = T_32PRCHAR(0470), offset = 0
member name = 'sz'
list[1] = LF_MEMBER, public, type = T_32PUSHORT(0421), offset = 0
member name = 'wz'

0x369e : Length = 22, Leaf = 0x1506 LF_UNION
# members = 2, field list type 0x369d, NESTED, Size = 4 ,class name = __unnamed
"""


def test_unnamed_union():
    """Make sure we can parse anonymous union types without a UDT"""
    parser = CvdumpTypesParser()
    for line in UNNAMED_UNION_DATA.split("\n"):
        parser.read_line(line)

    # Make sure we can parse the members line
    union = parser.keys["0x369e"]
    assert union["size"] == 4
@ -1,83 +0,0 @@
import pytest
from isledecomp.cvdump.demangler import (
    demangle_string_const,
    demangle_vtable,
    parse_encoded_number,
    InvalidEncodedNumberError,
    get_vtordisp_name,
)

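# Each case: (mangled string constant, string length, is_utf16). The digit
# after "??_C@_" selects narrow (0) or wide (1); the length that follows is
# either a literal digit or an encoded number (see parse_encoded_number below).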
string_demangle_cases = [
    ("??_C@_08LIDF@December?$AA@", 8, False),
    ("??_C@_0L@EGPP@english?9nz?$AA@", 11, False),
    (
        "??_C@_1O@POHA@?$AA?$CI?$AAn?$AAu?$AAl?$AAl?$AA?$CJ?$AA?$AA?$AA?$AA?$AA?$AH?$AA?$AA?$AA?$AA?$AA?$AA?$AA?$9A?$AE?$;I@",
        14,
        True,
    ),
    ("??_C@_00A@?$AA@", 0, False),
    ("??_C@_01A@?$AA?$AA@", 1, False),
]


@pytest.mark.parametrize("symbol, strlen, is_utf16", string_demangle_cases)
def test_strings(symbol, is_utf16, strlen):
    s = demangle_string_const(symbol)
    assert s.len == strlen
    assert s.is_utf16 == is_utf16


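# MSVC encodes numeric values in base 16 using the letters A-P
# (A = 0 ... P = 15), terminated by '@'. "BCD@" is 0x123 == 291.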
encoded_numbers = [
    ("A@", 0),
    ("AA@", 0),  # would never happen?
    ("P@", 15),
    ("BA@", 16),
    ("BCD@", 291),
]


@pytest.mark.parametrize("string, value", encoded_numbers)
def test_encoded_numbers(string, value):
    assert parse_encoded_number(string) == value


def test_invalid_encoded_number():
    with pytest.raises(InvalidEncodedNumberError):
        parse_encoded_number("Hello")


vtable_cases = [
    ("??_7LegoCarBuildAnimPresenter@@6B@", "LegoCarBuildAnimPresenter::`vftable'"),
    ("??_7?$MxCollection@PAVLegoWorld@@@@6B@", "MxCollection<LegoWorld *>::`vftable'"),
    (
        "??_7?$MxPtrList@VLegoPathController@@@@6B@",
        "MxPtrList<LegoPathController>::`vftable'",
    ),
    ("??_7Renderer@Tgl@@6B@", "Tgl::Renderer::`vftable'"),
    ("??_7LegoExtraActor@@6B0@@", "LegoExtraActor::`vftable'{for `LegoExtraActor'}"),
    (
        "??_7LegoExtraActor@@6BLegoAnimActor@@@",
        "LegoExtraActor::`vftable'{for `LegoAnimActor'}",
    ),
    (
        "??_7LegoAnimActor@@6B?$LegoContainer@PAM@@@",
        "LegoAnimActor::`vftable'{for `LegoContainer<float *>'}",
    ),
]


@pytest.mark.parametrize("symbol, class_name", vtable_cases)
def test_vtable(symbol, class_name):
    assert demangle_vtable(symbol) == class_name


def test_vtordisp():
    """Make sure we can accurately detect an adjuster thunk symbol"""
    assert get_vtordisp_name("") is None
    assert get_vtordisp_name("?ClassName@LegoExtraActor@@UBEPBDXZ") is None
    assert (
        get_vtordisp_name("?ClassName@LegoExtraActor@@$4PPPPPPPM@A@BEPBDXZ") is not None
    )

    # A function called vtordisp
    assert get_vtordisp_name("?vtordisp@LegoExtraActor@@UBEPBDXZ") is None
@ -1,212 +0,0 @@
from isledecomp.compare.asm.instgen import InstructGen, SectionType


def test_ret():
    """Make sure we can handle a function with one instruction."""
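    # 0xC3 is the x86 `ret` opcode: a complete one-instruction function.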
    ig = InstructGen(b"\xc3", 0)
    assert len(ig.sections) == 1


SCORE_NOTIFY = (
    b"\x53\x56\x57\x8b\xd9\x33\xff\x8b\x74\x24\x10\x56\xe8\xbf\xe1\x01"
    b"\x00\x80\xbb\xf6\x00\x00\x00\x00\x0f\x84\x9c\x00\x00\x00\x8b\x4e"
    b"\x04\x49\x83\xf9\x17\x0f\x87\x8f\x00\x00\x00\x33\xc0\x8a\x81\xec"
    b"\x14\x00\x10\xff\x24\x85\xd4\x14\x00\x10\x8b\xcb\xbf\x01\x00\x00"
    b"\x00\xe8\x7a\x05\x00\x00\x8b\xc7\x5f\x5e\x5b\xc2\x04\x00\x56\x8b"
    b"\xcb\xe8\xaa\x00\x00\x00\x8b\xf8\x8b\xc7\x5f\x5e\x5b\xc2\x04\x00"
    b"\x80\x7e\x18\x20\x75\x07\x8b\xcb\xe8\xc3\xfe\xff\xff\xbf\x01\x00"
    b"\x00\x00\x8b\xc7\x5f\x5e\x5b\xc2\x04\x00\x56\x8b\xcb\xe8\x3e\x02"
    b"\x00\x00\x8b\xf8\x8b\xc7\x5f\x5e\x5b\xc2\x04\x00\x6a\x09\xa1\x4c"
    b"\x45\x0f\x10\x6a\x07\x50\xe8\x35\x45\x01\x00\x83\xc4\x0c\x8b\x83"
    b"\xf8\x00\x00\x00\x85\xc0\x74\x0d\x50\xe8\xa2\x42\x01\x00\x8b\xc8"
    b"\xe8\x9b\x9b\x03\x00\xbf\x01\x00\x00\x00\x8b\xc7\x5f\x5e\x5b\xc2"
    b"\x04\x00\x8b\xff\x4a\x14\x00\x10\x5e\x14\x00\x10\x70\x14\x00\x10"
    b"\x8a\x14\x00\x10\x9c\x14\x00\x10\xca\x14\x00\x10\x00\x01\x05\x05"
    b"\x05\x05\x02\x05\x05\x05\x05\x05\x05\x05\x05\x05\x03\x05\x05\x05"
    b"\x05\x05\x05\x04\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc"
)


def test_score_notify():
    """Score::Notify function from 0x10001410 in LEGO1.
    Good representative function for jump table (at 0x100014d4)
    and switch data (at 0x100014ec)."""
    ig = InstructGen(SCORE_NOTIFY, 0x10001410)

    # Did we get everything?
    assert len(ig.sections) == 3
    types_only = tuple(s.type for s in ig.sections)
    assert types_only == (SectionType.CODE, SectionType.ADDR_TAB, SectionType.DATA_TAB)

    # CODE section stopped at correct place?
    instructions = ig.sections[0].contents
    assert instructions[-1].address == 0x100014D2
    # n.b. 0x100014d2 is the dummy instruction `mov edi, edi`
    # Ghidra does more thorough analysis and ignores this.
    # The last real instruction should be at 0x100014cf. Not a big deal
    # to include this because it is not junk data.

    # 6 switch addresses
    assert len(ig.sections[1].contents) == 6

    # TODO: The data table at the end includes all of the 0xCC padding bytes.


SMACK_CASE = (
    # LEGO1: 0x100cdc43 (modified so jump table points at +0x1016)
    b"\x2e\xff\x24\x8d\x16\x10\x00\x00"
    # LEGO1: 0x100cdb62 (instructions before and after jump table)
    b"\x8b\xf8\xeb\x1a\x87\xdb\x87\xc9\x87\xdb\x87\xc9\x87\xdb\x50\xdc"
    b"\x0c\x10\xd0\xe2\x0c\x10\xb0\xe8\x0c\x10\x50\xe9\x0c\x10\xa0\x10"
    b"\x27\x10\x10\x3c\x11\x77\x17\x8a\xc8"
)


def test_smack_case():
    """Case where we have code / jump table / code.
    Need to properly separate code sections, eliminate junk instructions
    and continue disassembling at the proper address following the data."""
    ig = InstructGen(SMACK_CASE, 0x1000)
    assert len(ig.sections) == 3
    assert ig.sections[0].type == ig.sections[2].type == SectionType.CODE

    # Make sure we captured the instruction immediately after
    assert ig.sections[2].contents[0].mnemonic == "mov"


# BETA10 0x1004c9cc
BETA_FUNC = (
    b"\x55\x8b\xec\x83\xec\x08\x53\x56\x57\x89\x4d\xfc\x8b\x45\xfc\x33"
    b"\xc9\x8a\x88\x19\x02\x00\x00\x89\x4d\xf8\xe9\x1e\x00\x00\x00\xe9"
    b"\x41\x00\x00\x00\xe9\x3c\x00\x00\x00\xe9\x37\x00\x00\x00\xe9\x32"
    b"\x00\x00\x00\xe9\x2d\x00\x00\x00\xe9\x28\x00\x00\x00\x83\x7d\xf8"
    b"\x04\x0f\x87\x1e\x00\x00\x00\x8b\x45\xf8\xff\x24\x85\x1d\xca\x04"
    b"\x10\xeb\xc9\x04\x10\xf0\xc9\x04\x10\xf5\xc9\x04\x10\xfa\xc9\x04"
    b"\x10\xff\xc9\x04\x10\xb0\x01\xe9\x00\x00\x00\x00\x5f\x5e\x5b\xc9"
    b"\xc2\x04\x00"
)


def test_beta_case():
    """Complete (and short) function with CODE / ADDR / CODE"""
    ig = InstructGen(BETA_FUNC, 0x1004C9CC)
    # The JMP into the jump table immediately precedes the jump table.
    # We have to detect this and switch sections correctly or we will only
    # get 1 section.
    assert len(ig.sections) == 3
    assert ig.sections[0].type == ig.sections[2].type == SectionType.CODE

    # Make sure we captured the instruction immediately after
    assert ig.sections[2].contents[0].mnemonic == "mov"


# LEGO1 0x1000fb50
# TODO: The test data here is longer than it needs to be.
THUNK_TEST = (
    b"\x2b\x49\xfc\xe9\x08\x00\x00\x00\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc"
    b"\x56\x8b\xf1\xe8\xd8\xc5\x00\x00\x8b\xce\xe8\xb1\xdc\x01\x00\xf6"
    b"\x44\x24\x08\x01\x74\x0c\x8d\x46\xe0\x50\xe8\xe1\x66\x07\x00\x83"
    b"\xc4\x04\x8d\x46\xe0\x5e\xc2\x04\x00\xcc\xcc\xcc\xcc\xcc\xcc\xcc"
    b"\x2b\x49\xfc\xe9\x08\x00\x00\x00\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc"
    b"\xb8\x7c\x05\x0f\x10\xc3\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc"
    b"\x2b\x49\xfc\xe9\x08\x00\x00\x00\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc"
    b"\x8b\x54"
    # The problem is here: the last two bytes are the start of the next
    # function 0x1000fbc0. This is not enough data to read an instruction.
)


def test_thunk_case():
    """Adjuster thunk incorrectly annotated.
    We are reading way more bytes than we should for this function."""
    ig = InstructGen(THUNK_TEST, 0x1000FB50)
    # No switch cases here, so the only section is code.
    # This caused an infinite loop during testing so the goal is just to finish.
    assert len(ig.sections) == 1

    # TODO: We might detect the 0xCC padding bytes and cut off the function.
    # If we did that, we would correctly read only 2 instructions.
    # assert len(ig.sections[0].contents) == 2


# LEGO1 0x1006f080, Infocenter::HandleEndAction
HANDLE_END_ACTION = (
    b"\x53\x56\x57\x8b\xf1\x8b\x5c\x24\x10\x8b\x0d\x84\x45\x0f\x10\x8b"
    b"\x7b\x0c\x8b\x47\x20\x39\x01\x75\x29\x81\x7f\x1c\xf3\x01\x00\x00"
    b"\x75\x20\xe8\x59\x66\xfa\xff\x6a\x00\x8b\x40\x18\x6a\x00\x6a\x10"
    b"\x50\xff\x15\x38\xb5\x10\x10\xb8\x01\x00\x00\x00\x5f\x5e\x5b\xc2"
    b"\x04\x00\x39\x46\x0c\x0f\x85\xa2\x00\x00\x00\x8b\x47\x1c\x83\xf8"
    b"\x28\x74\x18\x83\xf8\x29\x74\x13\x83\xf8\x2a\x74\x0e\x83\xf8\x2b"
    b"\x74\x09\x83\xf8\x2c\x0f\x85\x82\x00\x00\x00\x66\x8b\x86\xd4\x01"
    b"\x00\x00\x66\x85\xc0\x74\x09\x66\x48\x66\x89\x86\xd4\x01\x00\x00"
    b"\x66\x83\xbe\xd4\x01\x00\x00\x00\x75\x63\x6a\x0b\xe8\xff\x67\xfa"
    b"\xff\x66\x8b\x86\xfc\x00\x00\x00\x83\xc4\x04\x50\xe8\x3f\x66\xfa"
    b"\xff\x8b\xc8\xe8\x58\xa6\xfc\xff\x0f\xbf\x86\xfc\x00\x00\x00\x48"
    b"\x83\xf8\x04\x77\x2f\xff\x24\x85\x78\xf4\x06\x10\x68\x1d\x02\x00"
    b"\x00\xeb\x1a\x68\x1e\x02\x00\x00\xeb\x13\x68\x1f\x02\x00\x00\xeb"
    b"\x0c\x68\x20\x02\x00\x00\xeb\x05\x68\x21\x02\x00\x00\x8b\xce\xe8"
    b"\x9c\x21\x00\x00\x6a\x01\x8b\xce\xe8\x53\x1c\x00\x00\x8d\x8e\x0c"
    b"\x01\x00\x00\x53\x8b\x01\xff\x50\x04\x85\xc0\x0f\x85\xef\x02\x00"
    b"\x00\x8b\x56\x0c\x8b\x4f\x20\x3b\xd1\x74\x0e\x8b\x1d\x74\x45\x0f"
    b"\x10\x39\x0b\x0f\x85\xd7\x02\x00\x00\x81\x7f\x1c\x02\x02\x00\x00"
    b"\x75\x1a\x6a\x00\x52\x6a\x10\xe8\xa4\x65\xfa\xff\x8b\xc8\xe8\x0d"
    b"\xa2\xfb\xff\x66\xc7\x86\xd6\x01\x00\x00\x00\x00\x8b\x96\x00\x01"
    b"\x00\x00\x8d\x42\x74\x8b\x18\x83\xfb\x0c\x0f\x87\x9b\x02\x00\x00"
    b"\x33\xc9\x8a\x8b\xac\xf4\x06\x10\xff\x24\x8d\x8c\xf4\x06\x10\x8b"
    b"\x86\x08\x01\x00\x00\x83\xf8\x05\x77\x07\xff\x24\x85\xbc\xf4\x06"
    b"\x10\x8b\xce\xe8\xb8\x1a\x00\x00\x8b\x86\x00\x01\x00\x00\x68\xf4"
    b"\x01\x00\x00\x8b\xce\xc7\x40\x74\x0b\x00\x00\x00\xe8\xef\x20\x00"
    b"\x00\x8b\x86\x00\x01\x00\x00\xc7\x86\x08\x01\x00\x00\xff\xff\xff"
    b"\xff\x83\x78\x78\x00\x0f\x85\x40\x02\x00\x00\xb8\x01\x00\x00\x00"
    b"\x5f\x66\xc7\x86\xd2\x01\x00\x00\x01\x00\x5e\x5b\xc2\x04\x00\x6a"
    b"\x00\x8b\xce\x6a\x01\xe8\xd6\x19\x00\x00\xb8\x01\x00\x00\x00\x5f"
    b"\x5e\x5b\xc2\x04\x00\x6a\x01\x8b\xce\x6a\x02\xe8\xc0\x19\x00\x00"
    b"\xb8\x01\x00\x00\x00\x5f\x5e\x5b\xc2\x04\x00\x8b\xce\xe8\x3e\x1a"
    b"\x00\x00\x8b\x86\x00\x01\x00\x00\x68\x1c\x02\x00\x00\x8b\xce\xc7"
    b"\x40\x74\x0b\x00\x00\x00\xe8\x75\x20\x00\x00\xb8\x01\x00\x00\x00"
    b"\x5f\xc7\x86\x08\x01\x00\x00\xff\xff\xff\xff\x5e\x5b\xc2\x04\x00"
    b"\x8b\xce\xe8\x09\x1a\x00\x00\x8b\x86\x00\x01\x00\x00\x68\x1b\x02"
    b"\x00\x00\x8b\xce\xc7\x40\x74\x0b\x00\x00\x00\xe8\x40\x20\x00\x00"
    b"\xb8\x01\x00\x00\x00\x5f\xc7\x86\x08\x01\x00\x00\xff\xff\xff\xff"
    b"\x5e\x5b\xc2\x04\x00\xc7\x00\x0b\x00\x00\x00\x8b\x86\x08\x01\x00"
    b"\x00\x83\xf8\x04\x74\x0c\x83\xf8\x05\x74\x0e\x68\xf4\x01\x00\x00"
    b"\xeb\x0c\x68\x1c\x02\x00\x00\xeb\x05\x68\x1b\x02\x00\x00\x8b\xce"
    b"\xe8\xfb\x1f\x00\x00\xb8\x01\x00\x00\x00\x5f\xc7\x86\x08\x01\x00"
    b"\x00\xff\xff\xff\xff\x5e\x5b\xc2\x04\x00\x6a\x00\xa1\xa0\x76\x0f"
    b"\x10\x50\xe8\x39\x65\xfa\xff\x83\xc4\x08\xa1\xa4\x76\x0f\x10\x6a"
    b"\x00\x50\xe8\x29\x65\xfa\xff\x83\xc4\x08\xe8\xf1\x63\xfa\xff\x8b"
    b"\xc8\xe8\x6a\x02\x01\x00\xb8\x01\x00\x00\x00\x5f\x5e\x5b\xc2\x04"
    b"\x00\x8b\x47\x1c\x83\xf8\x46\x74\x09\x83\xf8\x47\x0f\x85\x09\x01"
    b"\x00\x00\x6a\x00\x6a\x00\x6a\x32\x6a\x03\xe8\x91\x65\xfa\xff\x8b"
    b"\xc8\xe8\xfa\xc7\xfd\xff\x8b\x86\x00\x01\x00\x00\x5f\x5e\x5b\xc7"
    b"\x40\x74\x0e\x00\x00\x00\xb8\x01\x00\x00\x00\xc2\x04\x00\x8b\x47"
    b"\x1c\x39\x86\xf8\x00\x00\x00\x0f\x85\xce\x00\x00\x00\xe8\xbe\x63"
    b"\xfa\xff\x83\x78\x10\x02\x74\x19\x66\x8b\x86\xfc\x00\x00\x00\x66"
    b"\x85\xc0\x74\x0d\x50\xe8\xa6\x63\xfa\xff\x8b\xc8\xe8\xbf\xa3\xfc"
    b"\xff\x6a\x00\x6a\x00\x6a\x32\x6a\x03\xe8\x32\x65\xfa\xff\x8b\xc8"
    b"\xe8\x9b\xc7\xfd\xff\x8b\x86\x00\x01\x00\x00\x5f\x5e\x5b\xc7\x40"
    b"\x74\x0e\x00\x00\x00\xb8\x01\x00\x00\x00\xc2\x04\x00\x83\x7a\x78"
    b"\x00\x75\x32\x8b\x86\xf8\x00\x00\x00\x83\xf8\x28\x74\x27\x83\xf8"
    b"\x29\x74\x22\x83\xf8\x2a\x74\x1d\x83\xf8\x2b\x74\x18\x83\xf8\x2c"
    b"\x74\x13\x66\xc7\x86\xd0\x01\x00\x00\x01\x00\x6a\x0b\xe8\xee\x64"
    b"\xfa\xff\x83\xc4\x04\x8b\x86\x00\x01\x00\x00\x6a\x01\x68\xdc\x44"
    b"\x0f\x10\xc7\x40\x74\x02\x00\x00\x00\xe8\x22\x64\xfa\xff\x83\xc4"
    b"\x08\xb8\x01\x00\x00\x00\x5f\x5e\x5b\xc2\x04\x00\x8b\x47\x1c\x39"
    b"\x86\xf8\x00\x00\x00\x75\x14\x6a\x00\x6a\x00\x6a\x32\x6a\x03\xe8"
    b"\x9c\x64\xfa\xff\x8b\xc8\xe8\x05\xc7\xfd\xff\xb8\x01\x00\x00\x00"
    b"\x5f\x5e\x5b\xc2\x04\x00\x8b\xff\x3c\xf1\x06\x10\x43\xf1\x06\x10"
    b"\x4a\xf1\x06\x10\x51\xf1\x06\x10\x58\xf1\x06\x10\xdf\xf1\x06\x10"
    b"\xd5\xf2\x06\x10\x1a\xf3\x06\x10\x51\xf3\x06\x10\x8e\xf3\x06\x10"
    b"\xed\xf3\x06\x10\x4c\xf4\x06\x10\x6b\xf4\x06\x10\x00\x01\x02\x07"
    b"\x03\x04\x07\x07\x07\x07\x07\x05\x06\x8d\x49\x00\x3f\xf2\x06\x10"
    b"\x55\xf2\x06\x10\xf1\xf1\x06\x10\xf1\xf1\x06\x10\x6b\xf2\x06\x10"
    b"\xa0\xf2\x06\x10\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc"
)


def test_action_case():
    """3 switches: 3 jump tables, 1 data table"""
    ig = InstructGen(HANDLE_END_ACTION, 0x1006F080)
    # Two of the jump tables (0x1006f478 with 5, 0x1006f48c with 8)
    # are contiguous.
    assert len(ig.sections) == 5
@ -1,152 +0,0 @@
"""Tests for the Bin (or IsleBin) module that:
1. Parses relevant data from the PE header and other structures.
2. Provides an interface to read from the DLL or EXE using a virtual address.
These are some basic smoke tests."""

import hashlib
from typing import Tuple
import pytest
from isledecomp.bin import (
    Bin as IsleBin,
    SectionNotFoundError,
    InvalidVirtualAddressError,
)


# LEGO1.DLL: v1.1 English, September
LEGO1_SHA256 = "14645225bbe81212e9bc1919cd8a692b81b8622abb6561280d99b0fc4151ce17"


@pytest.fixture(name="binfile", scope="session")
def fixture_binfile(pytestconfig) -> IsleBin:
    filename = pytestconfig.getoption("--lego1")

    # Skip this if we have not provided the path to LEGO1.dll.
    if filename is None:
        pytest.skip(allow_module_level=True, reason="No path to LEGO1")

    with open(filename, "rb") as f:
        digest = hashlib.sha256(f.read()).hexdigest()
        if digest != LEGO1_SHA256:
            pytest.fail(reason="Did not match expected LEGO1.DLL")

    with IsleBin(filename, find_str=True) as islebin:
        yield islebin


def test_basic(binfile: IsleBin):
    assert binfile.entry == 0x1008C860
    assert len(binfile.sections) == 6

    with pytest.raises(SectionNotFoundError):
        binfile.get_section_by_name(".hello")


SECTION_INFO = (
    (".text", 0x10001000, 0xD2A66, 0xD2C00),
    (".rdata", 0x100D4000, 0x1B5B6, 0x1B600),
    (".data", 0x100F0000, 0x1A734, 0x12C00),
    (".idata", 0x1010B000, 0x1006, 0x1200),
    (".rsrc", 0x1010D000, 0x21D8, 0x2200),
    (".reloc", 0x10110000, 0x10C58, 0x10E00),
)


@pytest.mark.parametrize("name, v_addr, v_size, raw_size", SECTION_INFO)
def test_sections(name: str, v_addr: int, v_size: int, raw_size: int, binfile: IsleBin):
    section = binfile.get_section_by_name(name)
    assert section.virtual_address == v_addr
    assert section.virtual_size == v_size
    assert section.size_of_raw_data == raw_size


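# The IEEE-754 double encoding of pi (0x400921FB54442D18), little-endian.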
DOUBLE_PI_BYTES = b"\x18\x2d\x44\x54\xfb\x21\x09\x40"

# Now that's a lot of pi
PI_ADDRESSES = (
    0x100D4000,
    0x100D4700,
    0x100D7180,
    0x100DB8F0,
    0x100DC030,
)


@pytest.mark.parametrize("addr", PI_ADDRESSES)
def test_read_pi(addr: int, binfile: IsleBin):
    assert binfile.read(addr, 8) == DOUBLE_PI_BYTES


def test_unusual_reads(binfile: IsleBin):
    """Reads that return an error or some specific value based on context"""
    # Reading an address earlier than the imagebase
    with pytest.raises(InvalidVirtualAddressError):
        binfile.read(0, 1)

    # Really big address
    with pytest.raises(InvalidVirtualAddressError):
        binfile.read(0xFFFFFFFF, 1)

    # Uninitialized part of .data
    assert binfile.read(0x1010A600, 4) is None

    # Past the end of virtual size in .text
    assert binfile.read(0x100D3A70, 4) == b"\x00\x00\x00\x00"


STRING_ADDRESSES = (
    (0x100DB588, b"November"),
    (0x100F0130, b"Helicopter"),
    (0x100F0144, b"HelicopterState"),
    (0x100F0BE4, b"valerie"),
    (0x100F4080, b"TARGET"),
)


@pytest.mark.parametrize("addr, string", STRING_ADDRESSES)
def test_strings(addr: int, string: bytes, binfile: IsleBin):
    """Test string read utility function and the string search feature"""
    assert binfile.read_string(addr) == string
    assert binfile.find_string(string) == addr


def test_relocation(binfile: IsleBin):
    # n.b. This is not the number of *relocations* read from .reloc.
    # It is the set of unique addresses in the binary that get relocated.
    assert len(binfile.get_relocated_addresses()) == 14066

    # Score::Score is referenced only by CALL instructions. No need to relocate.
    assert binfile.is_relocated_addr(0x10001000) is False

    # MxEntity::SetEntityId is in the vtable and must be relocated.
    assert binfile.is_relocated_addr(0x10001070) is True


# Not sanitizing dll name case. Do we care?
IMPORT_REFS = (
    ("KERNEL32.dll", "CreateMutexA", 0x1010B3D0),
    ("WINMM.dll", "midiOutPrepareHeader", 0x1010B550),
)


@pytest.mark.parametrize("import_ref", IMPORT_REFS)
def test_imports(import_ref: Tuple[str, str, int], binfile: IsleBin):
    assert import_ref in binfile.imports


# Location of the JMP instruction and the import address.
THUNKS = (
    (0x100D3728, 0x1010B32C),  # DirectDrawCreate
    (0x10098F9E, 0x1010B3D4),  # RtlUnwind
)


@pytest.mark.parametrize("thunk_ref", THUNKS)
def test_thunks(thunk_ref: Tuple[int, int], binfile: IsleBin):
    assert thunk_ref in binfile.thunks


def test_exports(binfile: IsleBin):
    assert len(binfile.exports) == 130
    assert (0x1003BFB0, b"??0LegoBackgroundColor@@QAE@PBD0@Z") in binfile.exports
    assert (0x10091EE0, b"_DllMain@12") in binfile.exports
@ -1,144 +0,0 @@
import pytest
from isledecomp.parser import DecompLinter
from isledecomp.parser.error import ParserError


@pytest.fixture(name="linter")
def fixture_linter():
    return DecompLinter()


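# Annotations follow the project's "// <TYPE>: <MODULE> <address>" convention;
# the linter checks their ordering and uniqueness per module.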
def test_simple_in_order(linter):
    lines = [
        "// FUNCTION: TEST 0x1000",
        "void function1() {}",
        "// FUNCTION: TEST 0x2000",
        "void function2() {}",
        "// FUNCTION: TEST 0x3000",
        "void function3() {}",
    ]
    assert linter.check_lines(lines, "test.cpp", "TEST") is True


def test_simple_not_in_order(linter):
    lines = [
        "// FUNCTION: TEST 0x1000",
        "void function1() {}",
        "// FUNCTION: TEST 0x3000",
        "void function3() {}",
        "// FUNCTION: TEST 0x2000",
        "void function2() {}",
    ]
    assert linter.check_lines(lines, "test.cpp", "TEST") is False
    assert len(linter.alerts) == 1

    assert linter.alerts[0].code == ParserError.FUNCTION_OUT_OF_ORDER
    # N.B. Line number given is the start of the function, not the marker
    assert linter.alerts[0].line_number == 6


def test_byname_ignored(linter):
    """Should ignore lookup-by-name markers when checking order."""
    lines = [
        "// FUNCTION: TEST 0x1000",
        "void function1() {}",
        "// FUNCTION: TEST 0x3000",
        "// MyClass::MyMethod",
        "// FUNCTION: TEST 0x2000",
        "void function2() {}",
    ]
    # This will fail because byname lookup does not belong in the cpp file
    assert linter.check_lines(lines, "test.cpp", "TEST") is False
    # but it should not fail for function order.
    assert all(
        alert.code != ParserError.FUNCTION_OUT_OF_ORDER for alert in linter.alerts
    )


def test_module_isolation(linter):
    """Should check the order of markers from a single module only."""
    lines = [
        "// FUNCTION: ALPHA 0x0001",
        "// FUNCTION: TEST 0x1000",
        "void function1() {}",
        "// FUNCTION: ALPHA 0x0002",
        "// FUNCTION: TEST 0x2000",
        "void function2() {}",
        "// FUNCTION: ALPHA 0x0003",
        "// FUNCTION: TEST 0x3000",
        "void function3() {}",
    ]

    assert linter.check_lines(lines, "test.cpp", "TEST") is True
    linter.reset(True)
    assert linter.check_lines(lines, "test.cpp", "ALPHA") is True


def test_byname_headers_only(linter):
    """Markers that are referenced by name with cvdump belong in header files only."""
    lines = [
        "// FUNCTION: TEST 0x1000",
        "// MyClass::~MyClass",
    ]

    assert linter.check_lines(lines, "test.h", "TEST") is True
    linter.reset(True)
    assert linter.check_lines(lines, "test.cpp", "TEST") is False
    assert linter.alerts[0].code == ParserError.BYNAME_FUNCTION_IN_CPP


def test_duplicate_offsets(linter):
    """The linter will retain module/offset pairs found until we do a full reset."""
    lines = [
        "// FUNCTION: TEST 0x1000",
        "// FUNCTION: HELLO 0x1000",
        "// MyClass::~MyClass",
    ]

    # Should not fail for duplicate offset 0x1000 because the modules are unique.
    assert linter.check_lines(lines, "test.h", "TEST") is True

    # Simulate a failure by reading the same file twice.
    assert linter.check_lines(lines, "test.h", "TEST") is False

    # Two errors because offsets from both modules are duplicated
    assert len(linter.alerts) == 2
    assert all(a.code == ParserError.DUPLICATE_OFFSET for a in linter.alerts)

    # Partial reset will retain the list of seen offsets.
    linter.reset(False)
    assert linter.check_lines(lines, "test.h", "TEST") is False

    # Full reset will forget seen offsets.
    linter.reset(True)
    assert linter.check_lines(lines, "test.h", "TEST") is True


def test_duplicate_strings(linter):
    """Duplicate string markers are okay if the string value is the same."""
    string_lines = [
        "// STRING: TEST 0x1000",
        'return "hello world";',
    ]

    # No problem to use this marker twice.
    assert linter.check_lines(string_lines, "test.h", "TEST") is True
    assert linter.check_lines(string_lines, "test.h", "TEST") is True

    different_string = [
        "// STRING: TEST 0x1000",
        'return "hi there";',
    ]

    # Same address but the string is different
    assert linter.check_lines(different_string, "greeting.h", "TEST") is False
    assert len(linter.alerts) == 1
    assert linter.alerts[0].code == ParserError.WRONG_STRING

    same_addr_reused = [
        "// GLOBAL: TEST 0x1000",
        "int g_test = 123;",
    ]

    # This will fail like any other offset reuse.
    assert linter.check_lines(same_addr_reused, "other.h", "TEST") is False
@ -1,773 +0,0 @@
import pytest
from isledecomp.parser.parser import (
    ReaderState,
    DecompParser,
)
from isledecomp.parser.error import ParserError


@pytest.fixture(name="parser")
def fixture_parser():
    return DecompParser()


def test_missing_sig(parser):
    """In the hopefully rare scenario that the function signature and marker
    are swapped, we still have enough to match with reccmp"""
    parser.read_lines(
        [
            "void my_function()",
            "// FUNCTION: TEST 0x1234",
            "{",
            "}",
        ]
    )
    assert parser.state == ReaderState.SEARCH
    assert len(parser.functions) == 1
    assert parser.functions[0].line_number == 3

    assert len(parser.alerts) == 1
    assert parser.alerts[0].code == ParserError.MISSED_START_OF_FUNCTION


def test_not_exact_syntax(parser):
    """Alert to inexact syntax right here in the parser instead of kicking it downstream.
    Doing this means we don't have to save the actual text."""
    parser.read_line("// function: test 0x1234")
    assert len(parser.alerts) == 1
    assert parser.alerts[0].code == ParserError.BAD_DECOMP_MARKER


def test_invalid_marker(parser):
    """We matched a decomp marker, but it's not one we care about"""
    parser.read_line("// BANANA: TEST 0x1234")
    assert parser.state == ReaderState.SEARCH

    assert len(parser.alerts) == 1
    assert parser.alerts[0].code == ParserError.BOGUS_MARKER


def test_incompatible_marker(parser):
    """The marker we just read cannot be handled in the current parser state"""
    parser.read_lines(
        [
            "// FUNCTION: TEST 0x1234",
            "// GLOBAL: TEST 0x5000",
        ]
    )
    assert parser.state == ReaderState.SEARCH
    assert len(parser.alerts) == 1
    assert parser.alerts[0].code == ParserError.INCOMPATIBLE_MARKER


def test_variable(parser):
    """Should identify a global variable"""
    parser.read_lines(
        [
            "// GLOBAL: HELLO 0x1234",
            "int g_value = 5;",
        ]
    )
    assert len(parser.variables) == 1


def test_synthetic_plus_marker(parser):
    """Marker tracking preempts synthetic name detection.
    Should fail with error and not log the synthetic"""
    parser.read_lines(
        [
            "// SYNTHETIC: HEY 0x555",
            "// FUNCTION: HOWDY 0x1234",
        ]
    )
    assert len(parser.functions) == 0
    assert len(parser.alerts) == 1
    assert parser.alerts[0].code == ParserError.INCOMPATIBLE_MARKER


def test_different_markers_different_module(parser):
    """Does it make any sense for a function to be a stub in one module,
    but not in another? I don't know. But it's no problem for us."""
    parser.read_lines(
        [
            "// FUNCTION: HOWDY 0x1234",
            "// STUB: SUP 0x5555",
            "void interesting_function() {",
            "}",
        ]
    )

    assert len(parser.alerts) == 0
    assert len(parser.functions) == 2


def test_different_markers_same_module(parser):
    """Now, if something is a regular function but then a stub,
    what do we say about that?"""
    parser.read_lines(
        [
            "// FUNCTION: HOWDY 0x1234",
            "// STUB: HOWDY 0x5555",
            "void interesting_function() {",
            "}",
        ]
    )

    # Use first marker declaration, don't replace
    assert len(parser.functions) == 1
    assert parser.functions[0].should_skip() is False

    # Should alert to this
    assert len(parser.alerts) == 1
    assert parser.alerts[0].code == ParserError.DUPLICATE_MODULE


def test_unexpected_synthetic(parser):
    """FUNCTION then SYNTHETIC should fail to report either one"""
    parser.read_lines(
        [
            "// FUNCTION: HOWDY 0x1234",
            "// SYNTHETIC: HOWDY 0x5555",
            "void interesting_function() {",
            "}",
        ]
    )

    assert parser.state == ReaderState.SEARCH
    assert len(parser.functions) == 0
    assert len(parser.alerts) == 1
    assert parser.alerts[0].code == ParserError.INCOMPATIBLE_MARKER


@pytest.mark.skip(reason="not implemented yet")
def test_duplicate_offset(parser):
    """Repeating the same module/offset in the same file is probably a typo"""
    parser.read_lines(
        [
            "// GLOBAL: HELLO 0x1234",
            "int x = 1;",
            "// GLOBAL: HELLO 0x1234",
            "int y = 2;",
        ]
    )

    assert len(parser.alerts) == 1
    assert parser.alerts[0].code == ParserError.DUPLICATE_OFFSET


def test_multiple_variables(parser):
    """Theoretically the same global variable can appear in multiple modules"""
    parser.read_lines(
        [
            "// GLOBAL: HELLO 0x1234",
            "// GLOBAL: WUZZUP 0x555",
            "const char *g_greeting;",
        ]
    )
    assert len(parser.alerts) == 0
    assert len(parser.variables) == 2


def test_multiple_variables_same_module(parser):
    """Should not overwrite offset"""
    parser.read_lines(
        [
            "// GLOBAL: HELLO 0x1234",
            "// GLOBAL: HELLO 0x555",
            "const char *g_greeting;",
        ]
    )
    assert len(parser.alerts) == 1
    assert parser.alerts[0].code == ParserError.DUPLICATE_MODULE
    assert len(parser.variables) == 1
    assert parser.variables[0].offset == 0x1234


def test_multiple_vtables(parser):
    parser.read_lines(
        [
            "// VTABLE: HELLO 0x1234",
            "// VTABLE: TEST 0x5432",
            "class MxString : public MxCore {",
        ]
    )
    assert len(parser.alerts) == 0
    assert len(parser.vtables) == 2
    assert parser.vtables[0].name == "MxString"


def test_multiple_vtables_same_module(parser):
    """Should not overwrite offset"""
    parser.read_lines(
        [
            "// VTABLE: HELLO 0x1234",
            "// VTABLE: HELLO 0x5432",
            "class MxString : public MxCore {",
        ]
    )
    assert len(parser.alerts) == 1
    assert parser.alerts[0].code == ParserError.DUPLICATE_MODULE
    assert len(parser.vtables) == 1
    assert parser.vtables[0].offset == 0x1234


def test_synthetic(parser):
    parser.read_lines(
        [
            "// SYNTHETIC: TEST 0x1234",
            "// TestClass::TestMethod",
        ]
    )
    assert len(parser.functions) == 1
    assert parser.functions[0].lookup_by_name is True
    assert parser.functions[0].name == "TestClass::TestMethod"


def test_synthetic_same_module(parser):
    parser.read_lines(
        [
            "// SYNTHETIC: TEST 0x1234",
            "// SYNTHETIC: TEST 0x555",
            "// TestClass::TestMethod",
        ]
    )
    assert len(parser.alerts) == 1
    assert parser.alerts[0].code == ParserError.DUPLICATE_MODULE
    assert len(parser.functions) == 1
    assert parser.functions[0].offset == 0x1234


def test_synthetic_no_comment(parser):
    """Synthetic marker followed by a code line (i.e. non-comment)"""
    parser.read_lines(
        [
            "// SYNTHETIC: TEST 0x1234",
            "int x = 123;",
        ]
    )
    assert len(parser.functions) == 0
    assert len(parser.alerts) == 1
    assert parser.alerts[0].code == ParserError.BAD_NAMEREF
    assert parser.state == ReaderState.SEARCH


def test_single_line_function(parser):
    parser.read_lines(
        [
            "// FUNCTION: TEST 0x1234",
            "int hello() { return 1234; }",
        ]
    )
    assert len(parser.functions) == 1
    assert parser.functions[0].line_number == 2
    assert parser.functions[0].end_line == 2


def test_indented_function(parser):
    """Track the number of whitespace characters when we begin the function
    and check that against each closing curly brace we read.
    Should not report a syntax warning if the function is indented"""
    parser.read_lines(
        [
            "    // FUNCTION: TEST 0x1234",
            "    void indented()",
            "    {",
            "        // TODO",
            "    }",
            "    // FUNCTION: NEXT 0x555",
        ]
    )
    assert len(parser.alerts) == 0


@pytest.mark.xfail(reason="todo")
def test_indented_no_curly_hint(parser):
    """Same as above, but opening curly brace is on the same line.
    Without the hint of how many whitespace characters to check, can we
    still identify the end of the function?"""
    parser.read_lines(
        [
            "    // FUNCTION: TEST 0x1234",
            "    void indented() {",
            "    }",
            "    // FUNCTION: NEXT 0x555",
        ]
    )
    assert len(parser.alerts) == 0


def test_implicit_lookup_by_name(parser):
    """FUNCTION (or STUB) offsets must directly precede the function signature.
    If we detect a comment instead, we assume that this is a lookup-by-name
    function and end here."""
    parser.read_lines(
        [
            "// FUNCTION: TEST 0x1234",
            "// TestClass::TestMethod()",
        ]
    )
    assert parser.state == ReaderState.SEARCH
    assert len(parser.functions) == 1
    assert parser.functions[0].lookup_by_name is True
    assert parser.functions[0].name == "TestClass::TestMethod()"


def test_function_with_spaces(parser):
    """There should not be any spaces between the end of FUNCTION markers
    and the start or name of the function. If it's a blank line, we can safely
    ignore but should alert to this."""
    parser.read_lines(
        [
            "// FUNCTION: TEST 0x1234",
            " ",
            "inline void test_function() { };",
        ]
    )
    assert len(parser.functions) == 1
    assert len(parser.alerts) == 1
    assert parser.alerts[0].code == ParserError.UNEXPECTED_BLANK_LINE


def test_function_with_spaces_implicit(parser):
    """Same as above, but for implicit lookup-by-name"""
    parser.read_lines(
        [
            "// FUNCTION: TEST 0x1234",
            " ",
            "// Implicit::Method",
        ]
    )
    assert len(parser.functions) == 1
    assert len(parser.alerts) == 1
    assert parser.alerts[0].code == ParserError.UNEXPECTED_BLANK_LINE


@pytest.mark.xfail(reason="will assume implicit lookup-by-name function")
def test_function_is_commented(parser):
    """In an ideal world, we would recognize that there is no code here.
    Some editors (or users) might comment the function on each line like this
    but hopefully it is rare."""
    parser.read_lines(
        [
            "// FUNCTION: TEST 0x1234",
            "// int my_function()",
            "// {",
            "// return 5;",
            "// }",
        ]
    )

    assert len(parser.functions) == 0


def test_unexpected_eof(parser):
    """If a decomp marker finds its way to the last line of the file,
    report that we could not get anything from it."""
    parser.read_lines(
        [
            "// FUNCTION: TEST 0x1234",
            "// Cls::Method",
            "// FUNCTION: TEST 0x5555",
        ]
    )
    parser.finish()

    assert len(parser.functions) == 1
    assert len(parser.alerts) == 1
    assert parser.alerts[0].code == ParserError.UNEXPECTED_END_OF_FILE


@pytest.mark.xfail(reason="no longer applies")
def test_global_variable_prefix(parser):
    """Global and static variables should have the g_ prefix."""
    parser.read_lines(
        [
            "// GLOBAL: TEST 0x1234",
            'const char* g_msg = "hello";',
        ]
    )
    assert len(parser.variables) == 1
    assert len(parser.alerts) == 0

    parser.read_lines(
        [
            "// GLOBAL: TEXT 0x5555",
            "int test = 5;",
        ]
    )
    assert len(parser.alerts) == 1
    assert parser.alerts[0].code == ParserError.GLOBAL_MISSING_PREFIX
    # In spite of that, we should still grab the variable name.
    assert parser.variables[1].name == "test"


def test_global_nomatch(parser):
    """We do our best to grab the variable name, even without the g_ prefix
    but this (by design) will not match everything."""

    parser.read_lines(
        [
            "// GLOBAL: TEST 0x1234",
            "FunctionCall();",
        ]
    )
    assert len(parser.variables) == 0
    assert len(parser.alerts) == 1
    assert parser.alerts[0].code == ParserError.NO_SUITABLE_NAME


def test_static_variable(parser):
    """We can detect whether a variable is a static function variable
    based on the parser's state when we detect it.
    Checking for the word `static` alone is not a good test.
    Static class variables are filed as S_GDATA32, same as regular globals."""

    parser.read_lines(
        [
            "// GLOBAL: TEST 0x1234",
            "int g_test = 1234;",
        ]
    )
    assert len(parser.variables) == 1
    assert parser.variables[0].is_static is False

    parser.read_lines(
        [
            "// FUNCTION: TEST 0x5555",
            "void test_function() {",
            "// GLOBAL: TEST 0x8888",
            "static int g_internal = 0;",
            "}",
        ]
    )
    assert len(parser.variables) == 2
    assert parser.variables[1].is_static is True


def test_reject_global_return(parser):
    """Previously we had annotated strings with the GLOBAL marker.
    For example: if a function returned a string. We now want these to be
    annotated with the STRING marker."""

    parser.read_lines(
        [
            "// FUNCTION: TEST 0x5555",
            "void test_function() {",
            "    // GLOBAL: TEST 0x8888",
            '    return "test";',
            "}",
        ]
    )
    assert len(parser.variables) == 0
    assert len(parser.alerts) == 1
    assert parser.alerts[0].code == ParserError.GLOBAL_NOT_VARIABLE


def test_global_string(parser):
    """We now allow GLOBAL and STRING markers for the same item."""

    parser.read_lines(
        [
            "// GLOBAL: TEST 0x1234",
            "// STRING: TEXT 0x5555",
            'char* g_test = "hello";',
        ]
    )
    assert len(parser.variables) == 1
    assert len(parser.strings) == 1
    assert len(parser.alerts) == 0

    assert parser.variables[0].name == "g_test"
    assert parser.strings[0].name == "hello"


def test_comment_variables(parser):
    """Match on hidden variables from libraries."""

    parser.read_lines(
        [
            "// GLOBAL: TEST 0x1234",
            "// g_test",
        ]
    )
    assert len(parser.variables) == 1
    assert parser.variables[0].name == "g_test"


def test_flexible_variable_prefix(parser):
    """Don't alert to library variables that lack the g_ prefix.
    This is out of our control."""

    parser.read_lines(
        [
            "// GLOBAL: TEST 0x1234",
            "// some_other_variable",
        ]
    )
    assert len(parser.variables) == 1
    assert len(parser.alerts) == 0
    assert parser.variables[0].name == "some_other_variable"


def test_string_ignore_g_prefix(parser):
    """String annotations above a regular variable should not alert to
    the missing g_ prefix. This is only required for GLOBAL markers."""

    parser.read_lines(
        [
            "// STRING: TEST 0x1234",
            'const char* value = "";',
        ]
    )
    assert len(parser.strings) == 1
    assert len(parser.alerts) == 0


def test_class_variable(parser):
    """We should accurately name static variables that are class members."""

    parser.read_lines(
        [
            "class Test {",
            "protected:",
            "    // GLOBAL: TEST 0x1234",
            "    static int g_test;",
            "};",
        ]
    )

    assert len(parser.variables) == 1
    assert parser.variables[0].name == "Test::g_test"


def test_namespace_variable(parser):
    """We should identify a namespace surrounding any global variables"""

    parser.read_lines(
        [
            "namespace Test {",
            "// GLOBAL: TEST 0x1234",
            "int g_test = 1234;",
            "}",
            "// GLOBAL: TEST 0x5555",
            "int g_second = 2;",
        ]
    )

    assert len(parser.variables) == 2
    assert parser.variables[0].name == "Test::g_test"
    assert parser.variables[1].name == "g_second"


def test_namespace_vtable(parser):
    parser.read_lines(
        [
            "namespace Tgl {",
            "// VTABLE: TEST 0x1234",
            "class Renderer {",
            "};",
            "}",
            "// VTABLE: TEST 0x5555",
            "class Hello { };",
        ]
    )

    assert len(parser.vtables) == 2
    assert parser.vtables[0].name == "Tgl::Renderer"
    assert parser.vtables[1].name == "Hello"


@pytest.mark.xfail(reason="no longer applies")
def test_global_prefix_namespace(parser):
    """Should correctly identify namespaces before checking for the g_ prefix"""

    parser.read_lines(
        [
            "class Test {",
            "    // GLOBAL: TEST 0x1234",
            "    static int g_count = 0;",
            "    // GLOBAL: TEST 0x5555",
            "    static int count = 0;",
            "};",
        ]
    )

    assert len(parser.variables) == 2
    assert parser.variables[0].name == "Test::g_count"
    assert parser.variables[1].name == "Test::count"

    assert len(parser.alerts) == 1
    assert parser.alerts[0].code == ParserError.GLOBAL_MISSING_PREFIX


def test_nested_namespace(parser):
    parser.read_lines(
        [
            "namespace Tgl {",
            "class Renderer {",
            "    // GLOBAL: TEST 0x1234",
            "    static int g_count = 0;",
            "};",
            "};",
        ]
    )

    assert len(parser.variables) == 1
    assert parser.variables[0].name == "Tgl::Renderer::g_count"


def test_match_qualified_variable(parser):
    """If a variable belongs to a scope and we use a fully qualified reference
    below a GLOBAL marker, make sure we capture the full name."""

    parser.read_lines(
        [
            "// GLOBAL: TEST 0x1234",
            "int MxTest::g_count = 0;",
        ]
    )

    assert len(parser.variables) == 1
    assert parser.variables[0].name == "MxTest::g_count"
    assert len(parser.alerts) == 0


def test_static_variable_parent(parser):
    """Report the address of the parent function that contains a static variable."""

    parser.read_lines(
        [
            "// FUNCTION: TEST 0x1234",
            "void test()",
            "{",
            "    // GLOBAL: TEST 0x5555",
            "    static int g_count = 0;",
            "}",
        ]
    )

    assert len(parser.variables) == 1
    assert parser.variables[0].is_static is True
    assert parser.variables[0].parent_function == 0x1234


@pytest.mark.xfail(
    reason="""Without the FUNCTION marker we don't know that we are inside a function,
    so we do not identify this variable as static."""
)
def test_static_variable_no_parent(parser):
    """If the function that contains a static variable is not marked, we
    cannot match it with cvdump so we should skip it and report an error."""

    parser.read_lines(
        [
            "void test()",
            "{",
            "    // GLOBAL: TEST 0x5555",
            "    static int g_count = 0;",
            "}",
        ]
    )

    # No way to match this variable so don't report it
    assert len(parser.variables) == 0
    assert len(parser.alerts) == 1
    assert parser.alerts[0].code == ParserError.ORPHANED_STATIC_VARIABLE


def test_static_variable_incomplete_coverage(parser):
    """If the function that contains a static variable is marked, but
    not for each module used for the variable itself, this is an error."""

    parser.read_lines(
        [
            "// FUNCTION: HELLO 0x1234",
            "void test()",
            "{",
            "    // GLOBAL: HELLO 0x5555",
            "    // GLOBAL: TEST 0x5555",
            "    static int g_count = 0;",
            "}",
        ]
    )

    # Match for HELLO module
    assert len(parser.variables) == 1

    # Failed for TEST module
    assert len(parser.alerts) == 1
    assert parser.alerts[0].code == ParserError.ORPHANED_STATIC_VARIABLE


def test_header_function_declaration(parser):
    """This is either a forward reference or a declaration in a header file.
    Meaning: The implementation is not here. This is not the correct place
    for the FUNCTION marker and it will probably not match anything."""

    parser.read_lines(
        [
            "// FUNCTION: HELLO 0x1234",
            "void sample_function(int);",
        ]
    )

    assert len(parser.alerts) == 1
    assert parser.alerts[0].code == ParserError.NO_IMPLEMENTATION


def test_extra(parser):
    """Allow a fourth field in the decomp annotation. Its use will vary
    depending on the marker type. Currently this is only used to identify
    a vtable with virtual inheritance."""

    # Intentionally using non-vtable markers here.
    # We might want to emit a parser warning for unnecessary extra info.
    parser.read_lines(
        [
            "// GLOBAL: TEST 0x5555 Haha",
            "int g_variable = 0;",
            "// FUNCTION: TEST 0x1234 Something",
            "void Test() { g_variable++; }",
            "// LIBRARY: TEST 0x8080 Printf",
            "// _printf",
        ]
    )

    # We don't use this information (yet) but this is all fine.
    assert len(parser.alerts) == 0


def test_virtual_inheritance(parser):
    """Indicate the base class for a vtable where the class uses
    virtual inheritance."""
    parser.read_lines(
        [
            "// VTABLE: HELLO 0x1234",
            "// VTABLE: HELLO 0x1238 Greetings",
            "// VTABLE: HELLO 0x123c Howdy",
            "class HiThere : public virtual Greetings {",
            "};",
        ]
    )

    assert len(parser.alerts) == 0
    assert len(parser.vtables) == 3
    assert parser.vtables[0].base_class is None
    assert parser.vtables[1].base_class == "Greetings"
    assert parser.vtables[2].base_class == "Howdy"
    assert all(v.name == "HiThere" for v in parser.vtables)


def test_namespace_in_comment(parser):
    parser.read_lines(
        [
            "// VTABLE: HELLO 0x1234",
            "// class Tgl::Object",
            "// VTABLE: HELLO 0x5555",
            "// class TglImpl::RendererImpl<D3DRMImpl::D3DRM>",
        ]
    )

    assert len(parser.vtables) == 2
    assert parser.vtables[0].name == "Tgl::Object"
    assert parser.vtables[1].name == "TglImpl::RendererImpl<D3DRMImpl::D3DRM>"
@ -1,141 +0,0 @@
import os
from typing import List, TextIO
import pytest
from isledecomp.parser import DecompParser
from isledecomp.parser.node import ParserSymbol

SAMPLE_DIR = os.path.join(os.path.dirname(__file__), "samples")


def sample_file(filename: str) -> TextIO:
    """Wrapper for opening the samples from the directory that does not
    depend on the cwd where we run the test"""
    full_path = os.path.join(SAMPLE_DIR, filename)
    return open(full_path, "r", encoding="utf-8")


def code_blocks_are_sorted(blocks: List[ParserSymbol]) -> bool:
    """Helper to make this more idiomatic"""
    just_offsets = [block.offset for block in blocks]
    return just_offsets == sorted(just_offsets)


@pytest.fixture(name="parser")
def fixture_parser():
    return DecompParser()


# Tests are below #


def test_sanity(parser):
    """Read a very basic file"""
    with sample_file("basic_file.cpp") as f:
        parser.read_lines(f)

    assert len(parser.functions) == 3
    assert code_blocks_are_sorted(parser.functions) is True
    # n.b. The parser returns line numbers as 1-based
    # Function starts when we see the opening curly brace
    assert parser.functions[0].line_number == 8
    assert parser.functions[0].end_line == 10


def test_oneline(parser):
    """(Assuming clang-format permits this) This sample has a function
    on a single line. This will test the end-of-function detection"""
    with sample_file("oneline_function.cpp") as f:
        parser.read_lines(f)

    assert len(parser.functions) == 2
    assert parser.functions[0].line_number == 5
    assert parser.functions[0].end_line == 5


def test_missing_offset(parser):
    """What if the function doesn't have an offset comment?"""
    with sample_file("missing_offset.cpp") as f:
        parser.read_lines(f)

    # TODO: For now, the function without the offset will just be ignored.
    # Would be the same outcome if the comment was present but mangled and
    # we failed to match it. We should detect these cases in the future.
    assert len(parser.functions) == 1


def test_jumbled_case(parser):
    """The parser just reports what it sees. It is the responsibility of
    the downstream tools to do something about a jumbled file.
    Just verify that we are reading it correctly."""
    with sample_file("out_of_order.cpp") as f:
        parser.read_lines(f)

    assert len(parser.functions) == 3
    assert code_blocks_are_sorted(parser.functions) is False


def test_bad_file(parser):
    with sample_file("poorly_formatted.cpp") as f:
        parser.read_lines(f)

    assert len(parser.functions) == 3


def test_indented(parser):
    """Offsets for functions inside of a class will probably be indented."""
    with sample_file("basic_class.cpp") as f:
        parser.read_lines(f)

    # TODO: We don't properly detect the end of these functions
    # because the closing brace is indented. However... knowing where each
    # function ends is less important (for now) than capturing
    # all the functions that are there.

    assert len(parser.functions) == 2
    assert parser.functions[0].offset == int("0x12345678", 16)
    assert parser.functions[0].line_number == 16
    # assert parser.functions[0].end_line == 19

    assert parser.functions[1].offset == int("0xdeadbeef", 16)
    assert parser.functions[1].line_number == 23
    # assert parser.functions[1].end_line == 25


def test_inline(parser):
    with sample_file("inline.cpp") as f:
        parser.read_lines(f)

    assert len(parser.functions) == 2
    for fun in parser.functions:
        assert fun.line_number is not None
        assert fun.line_number == fun.end_line


def test_multiple_offsets(parser):
    """If multiple offset marks appear before a code block, take them
    all but ensure module name (case-insensitive) is distinct.
    Use first module occurrence in case of duplicates."""
    with sample_file("multiple_offsets.cpp") as f:
        parser.read_lines(f)

    assert len(parser.functions) == 4
    assert parser.functions[0].module == "TEST"
    assert parser.functions[0].line_number == 9

    assert parser.functions[1].module == "HELLO"
    assert parser.functions[1].line_number == 9

    # Duplicate modules are ignored
    assert parser.functions[2].line_number == 16
    assert parser.functions[2].offset == 0x2345

    assert parser.functions[3].module == "TEST"
    assert parser.functions[3].offset == 0x2002


def test_variables(parser):
    with sample_file("global_variables.cpp") as f:
        parser.read_lines(f)

    assert len(parser.functions) == 1
    assert len(parser.variables) == 2
@ -1,141 +0,0 @@
from typing import Optional
import pytest
from isledecomp.parser.parser import (
    ReaderState as _rs,
    DecompParser,
)
from isledecomp.parser.error import ParserError as _pe

# fmt: off
state_change_marker_cases = [
    (_rs.SEARCH, "FUNCTION", _rs.WANT_SIG, None),
    (_rs.SEARCH, "GLOBAL", _rs.IN_GLOBAL, None),
    (_rs.SEARCH, "STUB", _rs.WANT_SIG, None),
    (_rs.SEARCH, "SYNTHETIC", _rs.IN_SYNTHETIC, None),
    (_rs.SEARCH, "TEMPLATE", _rs.IN_TEMPLATE, None),
    (_rs.SEARCH, "VTABLE", _rs.IN_VTABLE, None),
    (_rs.SEARCH, "LIBRARY", _rs.IN_LIBRARY, None),
    (_rs.SEARCH, "STRING", _rs.IN_GLOBAL, None),

    (_rs.WANT_SIG, "FUNCTION", _rs.WANT_SIG, None),
    (_rs.WANT_SIG, "GLOBAL", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.WANT_SIG, "STUB", _rs.WANT_SIG, None),
    (_rs.WANT_SIG, "SYNTHETIC", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.WANT_SIG, "TEMPLATE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.WANT_SIG, "VTABLE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.WANT_SIG, "LIBRARY", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.WANT_SIG, "STRING", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),

    (_rs.IN_FUNC, "FUNCTION", _rs.WANT_SIG, _pe.MISSED_END_OF_FUNCTION),
    (_rs.IN_FUNC, "GLOBAL", _rs.IN_FUNC_GLOBAL, None),
    (_rs.IN_FUNC, "STUB", _rs.WANT_SIG, _pe.MISSED_END_OF_FUNCTION),
    (_rs.IN_FUNC, "SYNTHETIC", _rs.IN_SYNTHETIC, _pe.MISSED_END_OF_FUNCTION),
    (_rs.IN_FUNC, "TEMPLATE", _rs.IN_TEMPLATE, _pe.MISSED_END_OF_FUNCTION),
    (_rs.IN_FUNC, "VTABLE", _rs.IN_VTABLE, _pe.MISSED_END_OF_FUNCTION),
    (_rs.IN_FUNC, "LIBRARY", _rs.IN_LIBRARY, _pe.MISSED_END_OF_FUNCTION),
    (_rs.IN_FUNC, "STRING", _rs.IN_FUNC_GLOBAL, None),

    (_rs.IN_TEMPLATE, "FUNCTION", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_TEMPLATE, "GLOBAL", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_TEMPLATE, "STUB", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_TEMPLATE, "SYNTHETIC", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_TEMPLATE, "TEMPLATE", _rs.IN_TEMPLATE, None),
    (_rs.IN_TEMPLATE, "VTABLE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_TEMPLATE, "LIBRARY", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_TEMPLATE, "STRING", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),

    (_rs.WANT_CURLY, "FUNCTION", _rs.SEARCH, _pe.UNEXPECTED_MARKER),
    (_rs.WANT_CURLY, "GLOBAL", _rs.SEARCH, _pe.UNEXPECTED_MARKER),
    (_rs.WANT_CURLY, "STUB", _rs.SEARCH, _pe.UNEXPECTED_MARKER),
    (_rs.WANT_CURLY, "SYNTHETIC", _rs.SEARCH, _pe.UNEXPECTED_MARKER),
    (_rs.WANT_CURLY, "TEMPLATE", _rs.SEARCH, _pe.UNEXPECTED_MARKER),
    (_rs.WANT_CURLY, "VTABLE", _rs.SEARCH, _pe.UNEXPECTED_MARKER),
    (_rs.WANT_CURLY, "LIBRARY", _rs.SEARCH, _pe.UNEXPECTED_MARKER),
    (_rs.WANT_CURLY, "STRING", _rs.SEARCH, _pe.UNEXPECTED_MARKER),

    (_rs.IN_GLOBAL, "FUNCTION", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_GLOBAL, "GLOBAL", _rs.IN_GLOBAL, None),
    (_rs.IN_GLOBAL, "STUB", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_GLOBAL, "SYNTHETIC", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_GLOBAL, "TEMPLATE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_GLOBAL, "VTABLE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_GLOBAL, "LIBRARY", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_GLOBAL, "STRING", _rs.IN_GLOBAL, None),

    (_rs.IN_FUNC_GLOBAL, "FUNCTION", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_FUNC_GLOBAL, "GLOBAL", _rs.IN_FUNC_GLOBAL, None),
    (_rs.IN_FUNC_GLOBAL, "STUB", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_FUNC_GLOBAL, "SYNTHETIC", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_FUNC_GLOBAL, "TEMPLATE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_FUNC_GLOBAL, "VTABLE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_FUNC_GLOBAL, "LIBRARY", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_FUNC_GLOBAL, "STRING", _rs.IN_FUNC_GLOBAL, None),

    (_rs.IN_VTABLE, "FUNCTION", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_VTABLE, "GLOBAL", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_VTABLE, "STUB", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_VTABLE, "SYNTHETIC", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_VTABLE, "TEMPLATE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_VTABLE, "VTABLE", _rs.IN_VTABLE, None),
    (_rs.IN_VTABLE, "LIBRARY", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_VTABLE, "STRING", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),

    (_rs.IN_SYNTHETIC, "FUNCTION", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_SYNTHETIC, "GLOBAL", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_SYNTHETIC, "STUB", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_SYNTHETIC, "SYNTHETIC", _rs.IN_SYNTHETIC, None),
    (_rs.IN_SYNTHETIC, "TEMPLATE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_SYNTHETIC, "VTABLE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_SYNTHETIC, "LIBRARY", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_SYNTHETIC, "STRING", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),

    (_rs.IN_LIBRARY, "FUNCTION", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_LIBRARY, "GLOBAL", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_LIBRARY, "STUB", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_LIBRARY, "SYNTHETIC", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_LIBRARY, "TEMPLATE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_LIBRARY, "VTABLE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_LIBRARY, "LIBRARY", _rs.IN_LIBRARY, None),
    (_rs.IN_LIBRARY, "STRING", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
]
# fmt: on


@pytest.mark.parametrize(
    "state, marker_type, new_state, expected_error", state_change_marker_cases
)
def test_state_change_by_marker(
    state: _rs, marker_type: str, new_state: _rs, expected_error: Optional[_pe]
):
    p = DecompParser()
    p.state = state
    mock_line = f"// {marker_type}: TEST 0x1234"
    p.read_line(mock_line)
    assert p.state == new_state

    if expected_error is not None:
        assert len(p.alerts) > 0
        assert p.alerts[0].code == expected_error


# Reading any of these lines should have no effect in ReaderState.SEARCH
search_lines_no_effect = [
    "",
    "\t",
    " ",
    "int x = 0;",
    "// Comment",
    "/*",
    "*/",
    "/* Block comment */",
    "{",
    "}",
]


@pytest.mark.parametrize("line", search_lines_no_effect)
def test_state_search_line(line: str):
    p = DecompParser()
    p.read_line(line)
    assert p.state == _rs.SEARCH
    assert len(p.alerts) == 0
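
The case table above doubles as a specification of the parser's marker state machine: each row gives the current state, an incoming marker type, the resulting state, and the alert (if any) that should be raised. For illustration only, here is a condensed sketch of the same lookup as a hypothetical next_state helper (not part of isledecomp), reusing the names defined in this file:

def next_state(state, marker_type):
    # Scan the case table for a matching (state, marker) row;
    # the third column is the state the parser should land in.
    for from_state, marker, to_state, _err in state_change_marker_cases:
        if from_state == state and marker == marker_type:
            return to_state
    return state

assert next_state(_rs.SEARCH, "FUNCTION") == _rs.WANT_SIG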
@ -1,209 +0,0 @@
import pytest
from isledecomp.parser.parser import MarkerDict
from isledecomp.parser.marker import (
    DecompMarker,
    MarkerType,
    match_marker,
    is_marker_exact,
)
from isledecomp.parser.util import (
    is_blank_or_comment,
    get_class_name,
    get_variable_name,
    get_string_contents,
)


blank_or_comment_param = [
    (True, ""),
    (True, "\t"),
    (True, " "),
    (False, "\tint abc=123;"),
    (True, "// OFFSET: LEGO1 0xdeadbeef"),
    (True, " /* Block comment beginning"),
    (True, "Block comment ending */ "),
    # TODO: does clang-format have anything to say about these cases?
    (False, "x++; // Comment follows"),
    (False, "x++; /* Block comment begins"),
]


@pytest.mark.parametrize("expected, line", blank_or_comment_param)
def test_is_blank_or_comment(line: str, expected: bool):
    assert is_blank_or_comment(line) is expected


marker_samples = [
    # (can_parse: bool, exact_match: bool, line: str)
    (True, True, "// FUNCTION: LEGO1 0xdeadbeef"),
    (True, True, "// FUNCTION: ISLE 0x12345678"),
    # No trailing spaces allowed
    (True, False, "// FUNCTION: LEGO1 0xdeadbeef "),
    # Must have exactly one space between elements
    (True, False, "//FUNCTION: ISLE 0xdeadbeef"),
    (True, False, "// FUNCTION:ISLE 0xdeadbeef"),
    (True, False, "//  FUNCTION: ISLE 0xdeadbeef"),
    (True, False, "// FUNCTION:  ISLE 0xdeadbeef"),
    (True, False, "// FUNCTION: ISLE  0xdeadbeef"),
    # Must have 0x prefix for hex number to match at all
    (False, False, "// FUNCTION: ISLE deadbeef"),
    # Offset, module name, and STUB must be uppercase
    (True, False, "// function: ISLE 0xdeadbeef"),
    (True, False, "// function: isle 0xdeadbeef"),
    # Hex string must be lowercase
    (True, False, "// FUNCTION: ISLE 0xDEADBEEF"),
    # TODO: How flexible should we be with matching the module name?
    (True, True, "// FUNCTION: OMNI 0x12345678"),
    (True, True, "// FUNCTION: LEG01 0x12345678"),
    (True, False, "// FUNCTION: hello 0x12345678"),
    # Not close enough to match
    (False, False, "// FUNCTION: ISLE0x12345678"),
    (False, False, "// FUNCTION: 0x12345678"),
    (False, False, "// LEGO1: 0x12345678"),
    # Hex string shorter than 8 characters
    (True, True, "// FUNCTION: LEGO1 0x1234"),
    # TODO: These match but shouldn't.
    # (False, False, '// FUNCTION: LEGO1 0'),
    # (False, False, '// FUNCTION: LEGO1 0x'),
    # Extra field
    (True, True, "// VTABLE: HELLO 0x1234 Extra"),
    # Extra with spaces
    (True, True, "// VTABLE: HELLO 0x1234 Whatever<SubClass *>"),
    # Extra, no space (if the first non-hex character is not in [a-f])
    (True, False, "// VTABLE: HELLO 0x1234Hello"),
    # Extra, many spaces
    (True, False, "// VTABLE: HELLO 0x1234   Hello"),
]


@pytest.mark.parametrize("match, _, line", marker_samples)
def test_marker_match(line: str, match: bool, _):
    did_match = match_marker(line) is not None
    assert did_match is match


@pytest.mark.parametrize("_, exact, line", marker_samples)
def test_marker_exact(line: str, exact: bool, _):
    assert is_marker_exact(line) is exact


def test_marker_dict_simple():
    d = MarkerDict()
    d.insert(DecompMarker("FUNCTION", "TEST", 0x1234))
    markers = list(d.iter())
    assert len(markers) == 1


def test_marker_dict_ofs_replace():
    d = MarkerDict()
    d.insert(DecompMarker("FUNCTION", "TEST", 0x1234))
    d.insert(DecompMarker("FUNCTION", "TEST", 0x555))
    markers = list(d.iter())
    assert len(markers) == 1
    assert markers[0].offset == 0x1234


def test_marker_dict_type_replace():
    d = MarkerDict()
    d.insert(DecompMarker("FUNCTION", "TEST", 0x1234))
    d.insert(DecompMarker("STUB", "TEST", 0x1234))
    markers = list(d.iter())
    assert len(markers) == 1
    assert markers[0].type == MarkerType.FUNCTION


class_name_match_cases = [
    ("struct MxString {", "MxString"),
    ("class MxString {", "MxString"),
    ("// class MxString", "MxString"),
    ("class MxString : public MxCore {", "MxString"),
    ("class MxPtrList<MxPresenter>", "MxPtrList<MxPresenter>"),
    # If it is possible to match the symbol MxList<LegoPathController *>::`vftable'
    # we should get the correct class name if possible. If the template type is a pointer,
    # the asterisk and class name are separated by one space.
    ("// class MxList<LegoPathController *>", "MxList<LegoPathController *>"),
    ("// class MxList<LegoPathController*>", "MxList<LegoPathController *>"),
    ("// class MxList<LegoPathController* >", "MxList<LegoPathController *>"),
    # I don't know if this would ever come up, but sure, why not?
    ("// class MxList<LegoPathController**>", "MxList<LegoPathController **>"),
    ("// class Many::Name::Spaces", "Many::Name::Spaces"),
]


@pytest.mark.parametrize("line, class_name", class_name_match_cases)
def test_get_class_name(line: str, class_name: str):
    assert get_class_name(line) == class_name


class_name_no_match_cases = [
    "MxString { ",
    "clas MxString",
    "// MxPtrList<MxPresenter>::`scalar deleting destructor'",
]


@pytest.mark.parametrize("line", class_name_no_match_cases)
def test_get_class_name_none(line: str):
    assert get_class_name(line) is None


variable_name_cases = [
    # with prefix for easy access
    ("char* g_test;", "g_test"),
    ("g_test;", "g_test"),
    ("void (*g_test)(int);", "g_test"),
    ("char g_test[50];", "g_test"),
    ("char g_test[50] = {1234,", "g_test"),
    ("int g_test = 500;", "g_test"),
    # no prefix
    ("char* hello;", "hello"),
    ("hello;", "hello"),
    ("void (*hello)(int);", "hello"),
    ("char hello[50];", "hello"),
    ("char hello[50] = {1234,", "hello"),
    ("int hello = 500;", "hello"),
]


@pytest.mark.parametrize("line,name", variable_name_cases)
def test_get_variable_name(line: str, name: str):
    assert get_variable_name(line) == name


string_match_cases = [
    ('return "hello world";', "hello world"),
    ('"hello\\\\"', "hello\\"),
    ('"hello \\"world\\""', 'hello "world"'),
    ('"hello\\nworld"', "hello\nworld"),
    # Only match first string if there are multiple options
    ('Method("hello", "world");', "hello"),
]


@pytest.mark.parametrize("line, string", string_match_cases)
def test_get_string_contents(line: str, string: str):
    assert get_string_contents(line) == string


def test_marker_extra_spaces():
    """The extra field can contain spaces"""
    marker = match_marker("// VTABLE: TEST 0x1234 S p a c e s")
    assert marker.extra == "S p a c e s"

    # Trailing spaces removed
    marker = match_marker("// VTABLE: TEST 0x8888 spaces   ")
    assert marker.extra == "spaces"

    # Trailing newline removed if present
    marker = match_marker("// VTABLE: TEST 0x5555 newline\n")
    assert marker.extra == "newline"


def test_marker_trailing_spaces():
    """Should ignore trailing spaces. (Invalid extra field)
    Offset field not truncated, extra field set to None."""

    marker = match_marker("// VTABLE: TEST 0x1234   ")
    assert marker is not None
    assert marker.offset == 0x1234
    assert marker.extra is None
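
Taken together, these cases pin down the marker grammar as `// TYPE: MODULE 0xoffset [extra]`. A minimal usage sketch, limited strictly to behavior asserted above (match_marker returning an object with offset and extra fields):

marker = match_marker("// FUNCTION: LEGO1 0x1234")
assert marker is not None
assert marker.offset == 0x1234
assert marker.extra is None  # no fourth field on this line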
@ -1,32 +0,0 @@
from os import name as os_name
import pytest
from isledecomp.dir import PathResolver


if os_name != "nt":
    pytest.skip(reason="Skip Windows-only tests", allow_module_level=True)


@pytest.fixture(name="resolver")
def fixture_resolver_win():
    yield PathResolver("C:\\isle")


def test_identity(resolver):
    assert resolver.resolve_cvdump("C:\\isle\\test.h") == "C:\\isle\\test.h"


def test_outside_basedir(resolver):
    assert resolver.resolve_cvdump("C:\\lego\\test.h") == "C:\\lego\\test.h"


def test_relative(resolver):
    assert resolver.resolve_cvdump(".\\test.h") == "C:\\isle\\test.h"
    assert resolver.resolve_cvdump("..\\test.h") == "C:\\test.h"


def test_intermediate_relative(resolver):
    """These paths may not register as `relative` paths, but we want to
    produce a single absolute path for each."""
    assert resolver.resolve_cvdump("C:\\isle\\test\\..\\test.h") == "C:\\isle\\test.h"
    assert resolver.resolve_cvdump(".\\subdir\\..\\test.h") == "C:\\isle\\test.h"
@ -1,69 +0,0 @@
from os import name as os_name
from unittest.mock import patch
import pytest
from isledecomp.dir import PathResolver


if os_name == "nt":
    pytest.skip(reason="Skip Posix-only tests", allow_module_level=True)


@pytest.fixture(name="resolver")
def fixture_resolver_posix():
    # Skip the call to winepath by using a patch, although this is not strictly necessary.
    with patch("isledecomp.dir.winepath_unix_to_win", return_value="Z:\\usr\\isle"):
        yield PathResolver("/usr/isle")


@patch("isledecomp.dir.winepath_win_to_unix")
def test_identity(winepath_mock, resolver):
    """Test with an absolute Wine path where a path swap is possible."""
    # In this and upcoming tests, patch is_file so we always assume there is
    # a file at the given unix path. We want to test the conversion logic only.
    with patch("pathlib.Path.is_file", return_value=True):
        assert resolver.resolve_cvdump("Z:\\usr\\isle\\test.h") == "/usr/isle/test.h"
        winepath_mock.assert_not_called()

    # Without the patch, this should call the winepath_mock, but we have
    # memoized the value from the previous run.
    assert resolver.resolve_cvdump("Z:\\usr\\isle\\test.h") == "/usr/isle/test.h"
    winepath_mock.assert_not_called()


@patch("isledecomp.dir.winepath_win_to_unix")
def test_file_does_not_exist(winepath_mock, resolver):
    """These test files (probably) don't exist, so we always assume
    the path swap failed and defer to winepath."""
    resolver.resolve_cvdump("Z:\\usr\\isle\\test.h")
    winepath_mock.assert_called_once_with("Z:\\usr\\isle\\test.h")


@patch("isledecomp.dir.winepath_win_to_unix")
def test_outside_basedir(winepath_mock, resolver):
    """Test an absolute path where we cannot do a path swap."""
    with patch("pathlib.Path.is_file", return_value=True):
        resolver.resolve_cvdump("Z:\\lego\\test.h")
        winepath_mock.assert_called_once_with("Z:\\lego\\test.h")


@patch("isledecomp.dir.winepath_win_to_unix")
def test_relative(winepath_mock, resolver):
    """Test relative paths inside and outside of the base dir."""
    with patch("pathlib.Path.is_file", return_value=True):
        assert resolver.resolve_cvdump("./test.h") == "/usr/isle/test.h"

        # This works because we will resolve "/usr/isle/test/../test.h"
        assert resolver.resolve_cvdump("../test.h") == "/usr/test.h"
        winepath_mock.assert_not_called()


@patch("isledecomp.dir.winepath_win_to_unix")
def test_intermediate_relative(winepath_mock, resolver):
    """We can resolve intermediate backdirs if they are relative to the basedir."""
    with patch("pathlib.Path.is_file", return_value=True):
        assert (
            resolver.resolve_cvdump("Z:\\usr\\isle\\test\\..\\test.h")
            == "/usr/isle/test.h"
        )
        assert resolver.resolve_cvdump(".\\subdir\\..\\test.h") == "/usr/isle/test.h"
        winepath_mock.assert_not_called()
@ -1,296 +0,0 @@
|
||||
from typing import Optional
|
||||
import pytest
|
||||
from isledecomp.compare.asm.parse import DisasmLiteInst, ParseAsm
|
||||
|
||||
|
||||
def mock_inst(mnemonic: str, op_str: str) -> DisasmLiteInst:
|
||||
"""Mock up the named tuple DisasmLite from just a mnemonic and op_str.
|
||||
To be used for tests on sanitize that do not require the instruction address
|
||||
or size. i.e. any non-jump instruction."""
|
||||
return DisasmLiteInst(0, 0, mnemonic, op_str)
|
||||
|
||||
|
||||
identity_cases = [
|
||||
("", ""),
|
||||
("sti", ""),
|
||||
("push", "ebx"),
|
||||
("ret", ""),
|
||||
("ret", "4"),
|
||||
("mov", "eax, 0x1234"),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("mnemonic, op_str", identity_cases)
|
||||
def test_identity(mnemonic, op_str):
|
||||
"""Confirm that nothing is substituted."""
|
||||
p = ParseAsm()
|
||||
inst = mock_inst(mnemonic, op_str)
|
||||
result = p.sanitize(inst)
|
||||
assert result == (mnemonic, op_str)
|
||||
|
||||
|
||||
ptr_replace_cases = [
|
||||
("byte ptr [0x5555]", "byte ptr [<OFFSET1>]"),
|
||||
("word ptr [0x5555]", "word ptr [<OFFSET1>]"),
|
||||
("dword ptr [0x5555]", "dword ptr [<OFFSET1>]"),
|
||||
("qword ptr [0x5555]", "qword ptr [<OFFSET1>]"),
|
||||
("eax, dword ptr [0x5555]", "eax, dword ptr [<OFFSET1>]"),
|
||||
("dword ptr [0x5555], eax", "dword ptr [<OFFSET1>], eax"),
|
||||
("dword ptr [0x5555], 0", "dword ptr [<OFFSET1>], 0"),
|
||||
("dword ptr [0x5555], 8", "dword ptr [<OFFSET1>], 8"),
|
||||
# Same value, assumed to be an addr in the first appearance
|
||||
# because it is designated as 'ptr', but we have not provided the
|
||||
# relocation table lookup method so we do not replace the second appearance.
|
||||
("dword ptr [0x5555], 0x5555", "dword ptr [<OFFSET1>], 0x5555"),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("start, end", ptr_replace_cases)
|
||||
def test_ptr_replace(start, end):
|
||||
"""Anything in square brackets (with the 'ptr' prefix) will always be replaced."""
|
||||
p = ParseAsm()
|
||||
inst = mock_inst("", start)
|
||||
(_, op_str) = p.sanitize(inst)
|
||||
assert op_str == end
|
||||
|
||||
|
||||
call_replace_cases = [
|
||||
("ebx", "ebx"),
|
||||
("0x1234", "<OFFSET1>"),
|
||||
("dword ptr [0x1234]", "dword ptr [<OFFSET1>]"),
|
||||
("dword ptr [ecx + 0x10]", "dword ptr [ecx + 0x10]"),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("start, end", call_replace_cases)
|
||||
def test_call_replace(start, end):
|
||||
"""Call with hex operand is always replaced.
|
||||
Otherwise, ptr replacement rules apply, but skip `this` calls."""
|
||||
p = ParseAsm()
|
||||
inst = mock_inst("call", start)
|
||||
(_, op_str) = p.sanitize(inst)
|
||||
assert op_str == end
|
||||
|
||||
|
||||
def test_jump_displacement():
|
||||
"""Display jump displacement (offset from end of jump instruction)
|
||||
instead of destination address."""
|
||||
p = ParseAsm()
|
||||
inst = DisasmLiteInst(0x1000, 2, "je", "0x1000")
|
||||
(_, op_str) = p.sanitize(inst)
|
||||
assert op_str == "-0x2"
|
||||
|
||||
|
||||
def test_jmp_table():
|
||||
"""To ignore cases where it would be inappropriate to replace pointer
|
||||
displacement (i.e. the vast majority of them) we require the address
|
||||
to be relocated. This excludes any address less than the imagebase."""
|
||||
p = ParseAsm()
|
||||
inst = mock_inst("jmp", "dword ptr [eax*4 + 0x5555]")
|
||||
(_, op_str) = p.sanitize(inst)
|
||||
# i.e. no change
|
||||
assert op_str == "dword ptr [eax*4 + 0x5555]"
|
||||
|
||||
def relocate_lookup(addr: int) -> bool:
|
||||
return addr == 0x5555
|
||||
|
||||
# Now add the relocation lookup
|
||||
p = ParseAsm(relocate_lookup=relocate_lookup)
|
||||
(_, op_str) = p.sanitize(inst)
|
||||
# Should replace it now
|
||||
assert op_str == "dword ptr [eax*4 + <OFFSET1>]"
|
||||
|
||||
|
||||
name_replace_cases = [
|
||||
("byte ptr [0x5555]", "byte ptr [_substitute_]"),
|
||||
("word ptr [0x5555]", "word ptr [_substitute_]"),
|
||||
("dword ptr [0x5555]", "dword ptr [_substitute_]"),
|
||||
("qword ptr [0x5555]", "qword ptr [_substitute_]"),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("start, end", name_replace_cases)
|
||||
def test_name_replace(start, end):
|
||||
"""Make sure the name lookup function is called if present"""
|
||||
|
||||
def substitute(_: int, __: bool) -> str:
|
||||
return "_substitute_"
|
||||
|
||||
p = ParseAsm(name_lookup=substitute)
|
||||
inst = mock_inst("mov", start)
|
||||
(_, op_str) = p.sanitize(inst)
|
||||
assert op_str == end
|
||||
|
||||
|
||||
def test_replacement_cache():
|
||||
p = ParseAsm()
|
||||
inst = mock_inst("inc", "dword ptr [0x1234]")
|
||||
|
||||
(_, op_str) = p.sanitize(inst)
|
||||
assert op_str == "dword ptr [<OFFSET1>]"
|
||||
|
||||
(_, op_str) = p.sanitize(inst)
|
||||
assert op_str == "dword ptr [<OFFSET1>]"
|
||||
|
||||
|
||||
def test_replacement_numbering():
|
||||
"""If we can use the name lookup for the first address but not the second,
|
||||
the second replacement should be <OFFSET2> not <OFFSET1>."""
|
||||
|
||||
def substitute_1234(addr: int, _: bool) -> Optional[str]:
|
||||
return "_substitute_" if addr == 0x1234 else None
|
||||
|
||||
p = ParseAsm(name_lookup=substitute_1234)
|
||||
|
||||
(_, op_str) = p.sanitize(mock_inst("inc", "dword ptr [0x1234]"))
|
||||
assert op_str == "dword ptr [_substitute_]"
|
||||
|
||||
(_, op_str) = p.sanitize(mock_inst("inc", "dword ptr [0x5555]"))
|
||||
assert op_str == "dword ptr [<OFFSET2>]"
|
||||
|
||||
|
||||
def test_relocate_lookup():
|
||||
"""Immediate values would be relocated if they are actually addresses.
|
||||
So we can use the relocation table to check whether a given value is an
|
||||
address or just some number."""
|
||||
|
||||
def relocate_lookup(addr: int) -> bool:
|
||||
return addr == 0x1234
|
||||
|
||||
p = ParseAsm(relocate_lookup=relocate_lookup)
|
||||
(_, op_str) = p.sanitize(mock_inst("mov", "eax, 0x1234"))
|
||||
assert op_str == "eax, <OFFSET1>"
|
||||
|
||||
(_, op_str) = p.sanitize(mock_inst("mov", "eax, 0x5555"))
|
||||
assert op_str == "eax, 0x5555"
|
||||
|
||||
|
||||
def test_jump_to_function():
|
||||
"""A jmp instruction can lead us directly to a function. This can be found
|
||||
in the unwind section at the end of a function. However: we do not want to
|
||||
assume this is the case for all jumps. Only replace the jump with a name
|
||||
if we can find it using our lookup."""
|
||||
|
||||
def substitute_1234(addr: int, _: bool) -> Optional[str]:
|
||||
return "_substitute_" if addr == 0x1234 else None
|
||||
|
||||
p = ParseAsm(name_lookup=substitute_1234)
|
||||
inst = DisasmLiteInst(0x1000, 2, "jmp", "0x1234")
|
||||
(_, op_str) = p.sanitize(inst)
|
||||
assert op_str == "_substitute_"
|
||||
|
||||
# Should not replace this jump.
|
||||
# 0x1000 (start addr)
|
||||
# + 2 (size of jump instruction)
|
||||
# + 0x5555 (displacement, the value we want)
|
||||
# = 0x6557
|
||||
inst = DisasmLiteInst(0x1000, 2, "jmp", "0x6557")
|
||||
(_, op_str) = p.sanitize(inst)
|
||||
assert op_str == "0x5555"
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="changed implementation")
|
||||
def test_float_replacement():
|
||||
"""Floating point constants often appear as pointers to data.
|
||||
A good example is ViewROI::IntrinsicImportance and the subclass override
|
||||
LegoROI::IntrinsicImportance. Both return 0.5, but this is done via the
|
||||
FLD instruction and a dword value at 0x100dbdec. In this case it is more
|
||||
valuable to just read the constant value rather than use a placeholder.
|
||||
The float constants don't appear to be deduplicated (like strings are)
|
||||
because there is another 0.5 at 0x100d40b0."""
|
||||
|
||||
def bin_lookup(addr: int, _: int) -> Optional[bytes]:
|
||||
return b"\xdb\x0f\x49\x40" if addr == 0x1234 else None
|
||||
|
||||
p = ParseAsm(bin_lookup=bin_lookup)
|
||||
inst = DisasmLiteInst(0x1000, 6, "fld", "dword ptr [0x1234]")
|
||||
(_, op_str) = p.sanitize(inst)
|
||||
# Single-precision float. struct.unpack("<f", struct.pack("<f", math.pi))
|
||||
assert op_str == "dword ptr [3.1415927410125732 (FLOAT)]"
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="changed implementation")
|
||||
def test_float_variable():
|
||||
"""If there is a variable at the address referenced by a float instruction,
|
||||
use the name instead of calling into the float replacement handler."""
|
||||
|
||||
def name_lookup(addr: int, _: bool) -> Optional[str]:
|
||||
return "g_myFloatVariable" if addr == 0x1234 else None
|
||||
|
||||
p = ParseAsm(name_lookup=name_lookup)
|
||||
inst = DisasmLiteInst(0x1000, 6, "fld", "dword ptr [0x1234]")
|
||||
(_, op_str) = p.sanitize(inst)
|
||||
assert op_str == "dword ptr [g_myFloatVariable]"
|
||||
|
||||
|
||||
def test_pointer_compare():
|
||||
"""A loop on an array could get optimized into comparing on the address
|
||||
that immediately follows the array. This may or may not be a valid address
|
||||
and it may or may not be annotated. To avoid a situation where an
|
||||
erroneous address value would get replaced with a placeholder and silently
|
||||
pass the comparison check, we will only replace an immediate value on the
|
||||
CMP instruction if it is a known address."""
|
||||
|
||||
# 0x1234 and 0x5555 are relocated and so are considered to be addresses.
|
||||
def relocate_lookup(addr: int) -> bool:
|
||||
return addr in (0x1234, 0x5555)
|
||||
|
||||
# Only 0x5555 is a "known" address
|
||||
def name_lookup(addr: int, _: bool) -> Optional[str]:
|
||||
return "hello" if addr == 0x5555 else None
|
||||
|
||||
p = ParseAsm(relocate_lookup=relocate_lookup, name_lookup=name_lookup)
|
||||
|
||||
# Will always replace on MOV instruction
|
||||
(_, op_str) = p.sanitize(mock_inst("mov", "eax, 0x1234"))
|
||||
assert op_str == "eax, <OFFSET1>"
|
||||
(_, op_str) = p.sanitize(mock_inst("mov", "eax, 0x5555"))
|
||||
assert op_str == "eax, hello"
|
||||
|
||||
# n.b. We have already cached the replacement for 0x1234, but the
|
||||
# special handling for CMP should skip the cache and not use it.
|
||||
|
||||
# Do not replace here
|
||||
(_, op_str) = p.sanitize(mock_inst("cmp", "eax, 0x1234"))
|
||||
assert op_str == "eax, 0x1234"
|
||||
# Should replace here
|
||||
(_, op_str) = p.sanitize(mock_inst("cmp", "eax, 0x5555"))
|
||||
assert op_str == "eax, hello"
|
||||
|
||||
|
||||
def test_absolute_indirect():
|
||||
"""The instruction `call dword ptr [0x1234]` means we call the function
|
||||
whose address is at 0x1234. (i.e. absolute indirect addressing mode)
|
||||
It is probably more useful to show the name of the function itself if
|
||||
we have it, but there are some circumstances where we want to replace
|
||||
with the pointer's name (i.e. an import function)."""
|
||||
|
||||
def name_lookup(addr: int, _: bool) -> Optional[str]:
|
||||
return {
|
||||
0x1234: "Hello",
|
||||
0x4321: "xyz",
|
||||
0x5555: "Test",
|
||||
}.get(addr)
|
||||
|
||||
def bin_lookup(addr: int, _: int) -> Optional[bytes]:
|
||||
return (
|
||||
{
|
||||
0x1234: b"\x55\x55\x00\x00",
|
||||
0x4321: b"\x99\x99\x00\x00",
|
||||
}
|
||||
).get(addr)
|
||||
|
||||
p = ParseAsm(name_lookup=name_lookup, bin_lookup=bin_lookup)
|
||||
|
||||
# If we know the indirect address (0x5555)
|
||||
# Arrow to indicate this is an indirect replacement
|
||||
(_, op_str) = p.sanitize(mock_inst("call", "dword ptr [0x1234]"))
|
||||
assert op_str == "dword ptr [->Test]"
|
||||
|
||||
# If we do not know the indirect address (0x9999)
|
||||
(_, op_str) = p.sanitize(mock_inst("call", "dword ptr [0x4321]"))
|
||||
assert op_str == "dword ptr [xyz]"
|
||||
|
||||
# If we can't read the indirect address
|
||||
(_, op_str) = p.sanitize(mock_inst("call", "dword ptr [0x5555]"))
|
||||
assert op_str == "dword ptr [Test]"
@ -1,867 +0,0 @@
// reccmp.js
/* global data */

// Unwrap array of functions into a dictionary with address as the key.
const dataDict = Object.fromEntries(data.map(row => [row.address, row]));

function getDataByAddr(addr) {
    return dataDict[addr];
}

//
// Pure functions
//

function formatAsm(entries, addrOption) {
    const output = [];

    const createTh = (text) => {
        const th = document.createElement('th');
        th.innerText = text;
        return th;
    };

    const createTd = (text, className = '') => {
        const td = document.createElement('td');
        td.innerText = text;
        td.className = className;
        return td;
    };

    entries.forEach(obj => {
        // These won't all be present. You get "both" for an equal node
        // and orig/recomp for a diff.
        const { both = [], orig = [], recomp = [] } = obj;

        output.push(...both.map(([addr, line, recompAddr]) => {
            const tr = document.createElement('tr');
            tr.appendChild(createTh(addr));
            tr.appendChild(createTh(recompAddr));
            tr.appendChild(createTd(line));
            return tr;
        }));

        output.push(...orig.map(([addr, line]) => {
            const tr = document.createElement('tr');
            tr.appendChild(createTh(addr));
            tr.appendChild(createTh(''));
            tr.appendChild(createTd(`-${line}`, 'diffneg'));
            return tr;
        }));

        output.push(...recomp.map(([addr, line]) => {
            const tr = document.createElement('tr');
            tr.appendChild(createTh(''));
            tr.appendChild(createTh(addr));
            tr.appendChild(createTd(`+${line}`, 'diffpos'));
            return tr;
        }));
    });

    return output;
}

// Special internal values to ensure this sort order for matching column:
// 1. Stub
// 2. Any match percentage [0.0, 1.0)
// 3. Effective match
// 4. Actual 100% match
function matchingColAdjustment(row) {
    if ('stub' in row) {
        return -1;
    }

    if ('effective' in row) {
        return 1.0;
    }

    if (row.matching === 1.0) {
        return 1000;
    }

    return row.matching;
}

function getCppClass(str) {
    const idx = str.indexOf('::');
    if (idx !== -1) {
        return str.slice(0, idx);
    }

    return str;
}

// Clamp string length to specified length and pad with ellipsis
function stringTruncate(str, maxlen = 20) {
    str = getCppClass(str);
    if (str.length > maxlen) {
        return `${str.slice(0, maxlen)}...`;
    }

    return str;
}

function getMatchPercentText(row) {
    if ('stub' in row) {
        return 'stub';
    }

    if ('effective' in row) {
        return '100.00%*';
    }

    return (row.matching * 100).toFixed(2) + '%';
}

function countDiffs(row) {
    const { diff = '' } = row;
    if (diff === '') {
        return '';
    }

    const diffs = diff.map(([slug, subgroups]) => subgroups).flat();
    const diffLength = diffs.filter(d => !('both' in d)).length;
    const diffWord = diffLength === 1 ? 'diff' : 'diffs';
    return diffLength === 0 ? '' : `${diffLength} ${diffWord}`;
}

// Helper for this set/remove attribute block
function setBooleanAttribute(element, attribute, value) {
    if (value) {
        element.setAttribute(attribute, '');
    } else {
        element.removeAttribute(attribute);
    }
}

function copyToClipboard(value) {
    navigator.clipboard.writeText(value);
}

const PAGE_SIZE = 200;

//
// Global state
//

class ListingState {
    constructor() {
        this._query = '';
        this._sortCol = 'address';
        this._filterType = 1;
        this._sortDesc = false;
        this._hidePerfect = false;
        this._hideStub = false;
        this._showRecomp = false;
        this._expanded = {};
        this._page = 0;

        this._listeners = [];

        this._results = [];
        this.updateResults();
    }

    addListener(fn) {
        this._listeners.push(fn);
    }

    callListeners() {
        for (const fn of this._listeners) {
            fn();
        }
    }

    isExpanded(addr) {
        return addr in this._expanded;
    }

    toggleExpanded(addr) {
        this.setExpanded(addr, !this.isExpanded(addr));
    }

    setExpanded(addr, value) {
        if (value) {
            this._expanded[addr] = true;
        } else {
            delete this._expanded[addr];
        }
    }

    updateResults() {
        const filterFn = this.rowFilterFn.bind(this);
        const sortFn = this.rowSortFn.bind(this);

        this._results = data.filter(filterFn).sort(sortFn);

        // Set _page directly to avoid double call to listeners.
        this._page = this.pageClamp(this.page);
        this.callListeners();
    }

    pageSlice() {
        return this._results.slice(this.page * PAGE_SIZE, (this.page + 1) * PAGE_SIZE);
    }

    resultsCount() {
        return this._results.length;
    }

    pageCount() {
        return Math.ceil(this._results.length / PAGE_SIZE);
    }

    maxPage() {
        return Math.max(0, this.pageCount() - 1);
    }

    // A list showing the range of each page based on the sort column and direction.
    pageHeadings() {
        if (this._results.length === 0) {
            return [];
        }

        const headings = [];

        for (let i = 0; i < this.pageCount(); i++) {
            const startIdx = i * PAGE_SIZE;
            const endIdx = Math.min(this._results.length, ((i + 1) * PAGE_SIZE)) - 1;

            let start = this._results[startIdx][this.sortCol];
            let end = this._results[endIdx][this.sortCol];

            if (this.sortCol === 'matching') {
                start = getMatchPercentText(this._results[startIdx]);
                end = getMatchPercentText(this._results[endIdx]);
            }

            headings.push([i, stringTruncate(start), stringTruncate(end)]);
        }

        return headings;
    }

    rowFilterFn(row) {
        // Destructuring sets defaults for optional values from this object.
        const {
            effective = false,
            stub = false,
            diff = '',
            name,
            address,
            matching
        } = row;

        if (this.hidePerfect && (effective || matching >= 1)) {
            return false;
        }

        if (this.hideStub && stub) {
            return false;
        }

        if (this.query === '') {
            return true;
        }

        // Name/addr search
        if (this.filterType === 1) {
            return (
                address.includes(this.query) ||
                name.toLowerCase().includes(this.query)
            );
        }

        // no diff for review.
        if (diff === '') {
            return false;
        }

        // special matcher for combined diff
        const anyLineMatch = ([addr, line]) => line.toLowerCase().trim().includes(this.query);

        // Flatten all diff groups for the search
        const diffs = diff.map(([slug, subgroups]) => subgroups).flat();
        for (const subgroup of diffs) {
            const { both = [], orig = [], recomp = [] } = subgroup;

            // If search includes context
            if (this.filterType === 2 && both.some(anyLineMatch)) {
                return true;
            }

            if (orig.some(anyLineMatch) || recomp.some(anyLineMatch)) {
                return true;
            }
        }

        return false;
    }

    rowSortFn(rowA, rowB) {
        const valA = this.sortCol === 'matching'
            ? matchingColAdjustment(rowA)
            : rowA[this.sortCol];

        const valB = this.sortCol === 'matching'
            ? matchingColAdjustment(rowB)
            : rowB[this.sortCol];

        if (valA > valB) {
            return this.sortDesc ? -1 : 1;
        } else if (valA < valB) {
            return this.sortDesc ? 1 : -1;
        }

        return 0;
    }

    pageClamp(page) {
        return Math.max(0, Math.min(page, this.maxPage()));
    }

    get page() {
        return this._page;
    }

    set page(page) {
        this._page = this.pageClamp(page);
        this.callListeners();
    }

    get filterType() {
        return parseInt(this._filterType);
    }

    set filterType(value) {
        value = parseInt(value);
        if (value >= 1 && value <= 3) {
            this._filterType = value;
        }
        this.updateResults();
    }

    get query() {
        return this._query;
    }

    set query(value) {
        // Normalize search string
        this._query = value.toLowerCase().trim();
        this.updateResults();
    }

    get showRecomp() {
        return this._showRecomp;
    }

    set showRecomp(value) {
        // Don't sort by the recomp column we are about to hide
        if (!value && this.sortCol === 'recomp') {
            this._sortCol = 'address';
        }

        this._showRecomp = value;
        this.callListeners();
    }

    get sortCol() {
        return this._sortCol;
    }

    set sortCol(column) {
        if (column === this._sortCol) {
            this._sortDesc = !this._sortDesc;
        } else {
            this._sortCol = column;
        }

        this.updateResults();
    }

    get sortDesc() {
        return this._sortDesc;
    }

    set sortDesc(value) {
        this._sortDesc = value;
        this.updateResults();
    }

    get hidePerfect() {
        return this._hidePerfect;
    }

    set hidePerfect(value) {
        this._hidePerfect = value;
        this.updateResults();
    }

    get hideStub() {
        return this._hideStub;
    }

    set hideStub(value) {
        this._hideStub = value;
        this.updateResults();
    }
}

const appState = new ListingState();

//
// Custom elements
//

// Sets sort indicator arrow based on element attributes.
class SortIndicator extends window.HTMLElement {
    static observedAttributes = ['data-sort'];

    attributeChangedCallback(name, oldValue, newValue) {
        if (newValue === null) {
            // Reserve space for blank indicator so column width stays the same
            this.innerHTML = '&nbsp;';
        } else {
            this.innerHTML = newValue === 'asc' ? '▲' : '▼';
        }
    }
}

class FuncRow extends window.HTMLElement {
    connectedCallback() {
        if (this.shadowRoot !== null) {
            return;
        }

        const template = document.querySelector('template#funcrow-template').content;
        const shadow = this.attachShadow({ mode: 'open' });
        shadow.appendChild(template.cloneNode(true));
        shadow.querySelector(':host > div[data-col="name"]').addEventListener('click', evt => {
            this.dispatchEvent(new Event('name-click'));
        });
    }

    get address() {
        return this.getAttribute('data-address');
    }
}

class NoDiffMessage extends window.HTMLElement {
    connectedCallback() {
        if (this.shadowRoot !== null) {
            return;
        }

        const template = document.querySelector('template#nodiff-template').content;
        const shadow = this.attachShadow({ mode: 'open' });
        shadow.appendChild(template.cloneNode(true));
    }
}

class CanCopy extends window.HTMLElement {
    connectedCallback() {
        if (this.shadowRoot !== null) {
            return;
        }

        const template = document.querySelector('template#can-copy-template').content;
        const shadow = this.attachShadow({ mode: 'open' });
        shadow.appendChild(template.cloneNode(true));

        const el = shadow.querySelector('slot').assignedNodes()[0];
        el.addEventListener('mouseout', evt => { this.copied = false; });
        el.addEventListener('click', evt => {
            copyToClipboard(evt.target.textContent);
            this.copied = true;
        });
    }

    get copied() {
        return this.getAttribute('copied');
    }

    set copied(value) {
        if (value) {
            setTimeout(() => { this.copied = false; }, 2000);
        }
        setBooleanAttribute(this, 'copied', value);
    }
}

// Displays asm diff for the given @data-address value.
class DiffRow extends window.HTMLElement {
    connectedCallback() {
        if (this.shadowRoot !== null) {
            return;
        }

        const template = document.querySelector('template#diffrow-template').content;
        const shadow = this.attachShadow({ mode: 'open' });
        shadow.appendChild(template.cloneNode(true));
    }

    get address() {
        return this.getAttribute('data-address');
    }

    set address(value) {
        this.setAttribute('data-address', value);
    }
}

class DiffDisplayOptions extends window.HTMLElement {
    static observedAttributes = ['data-option'];

    connectedCallback() {
        if (this.shadowRoot !== null) {
            return;
        }

        const shadow = this.attachShadow({ mode: 'open' });
        shadow.innerHTML = `
            <style>
                fieldset {
                    align-items: center;
                    display: flex;
                    margin-bottom: 20px;
                }

                label {
                    margin-right: 10px;
                    user-select: none;
                }

                label, input {
                    cursor: pointer;
                }
            </style>
            <fieldset>
                <legend>Address display:</legend>
                <input type="radio" id="showNone" name="addrDisplay" value=0>
                <label for="showNone">None</label>
                <input type="radio" id="showOrig" name="addrDisplay" value=1>
                <label for="showOrig">Original</label>
                <input type="radio" id="showBoth" name="addrDisplay" value=2>
                <label for="showBoth">Both</label>
            </fieldset>`;

        shadow.querySelectorAll('input[type=radio]').forEach(radio => {
            const checked = this.option === radio.getAttribute('value');
            setBooleanAttribute(radio, 'checked', checked);

            radio.addEventListener('change', evt => (this.option = evt.target.value));
        });
    }

    set option(value) {
        this.setAttribute('data-option', parseInt(value));
    }

    get option() {
        return this.getAttribute('data-option') ?? 1;
    }

    attributeChangedCallback(name, oldValue, newValue) {
        if (name !== 'data-option') {
            return;
        }

        this.dispatchEvent(new Event('change'));
    }
}

class DiffDisplay extends window.HTMLElement {
    static observedAttributes = ['data-option'];

    connectedCallback() {
        if (this.querySelector('diff-display-options') !== null) {
            return;
        }

        const optControl = new DiffDisplayOptions();
        optControl.option = this.option;
        optControl.addEventListener('change', evt => (this.option = evt.target.option));
        this.appendChild(optControl);

        const div = document.createElement('div');
        const obj = getDataByAddr(this.address);

        const createHeaderLine = (text, className) => {
            const div = document.createElement('div');
            div.textContent = text;
            div.className = className;
            return div;
        };

        const groups = obj.diff;
        groups.forEach(([slug, subgroups]) => {
            const secondTable = document.createElement('table');
            secondTable.classList.add('diffTable');

            const hdr = document.createElement('div');
            hdr.appendChild(createHeaderLine('---', 'diffneg'));
            hdr.appendChild(createHeaderLine('+++', 'diffpos'));
            hdr.appendChild(createHeaderLine(slug, 'diffslug'));
            div.appendChild(hdr);

            const tbody = document.createElement('tbody');
            secondTable.appendChild(tbody);

            const diffs = formatAsm(subgroups, this.option);
            for (const el of diffs) {
                tbody.appendChild(el);
            }

            div.appendChild(secondTable);
        });

        this.appendChild(div);
    }

    get address() {
        return this.getAttribute('data-address');
    }

    set address(value) {
        this.setAttribute('data-address', value);
    }

    get option() {
        return this.getAttribute('data-option') ?? 1;
    }

    set option(value) {
        this.setAttribute('data-option', value);
    }
}

class ListingOptions extends window.HTMLElement {
    constructor() {
        super();

        // Register to receive updates
        appState.addListener(() => this.onUpdate());

        const input = this.querySelector('input[type=search]');
        input.oninput = evt => (appState.query = evt.target.value);

        const hidePerf = this.querySelector('input#cbHidePerfect');
        hidePerf.onchange = evt => (appState.hidePerfect = evt.target.checked);
        hidePerf.checked = appState.hidePerfect;

        const hideStub = this.querySelector('input#cbHideStub');
        hideStub.onchange = evt => (appState.hideStub = evt.target.checked);
        hideStub.checked = appState.hideStub;

        const showRecomp = this.querySelector('input#cbShowRecomp');
        showRecomp.onchange = evt => (appState.showRecomp = evt.target.checked);
        showRecomp.checked = appState.showRecomp;

        this.querySelector('button#pagePrev').addEventListener('click', evt => {
            appState.page = appState.page - 1;
        });

        this.querySelector('button#pageNext').addEventListener('click', evt => {
            appState.page = appState.page + 1;
        });

        this.querySelector('select#pageSelect').addEventListener('change', evt => {
            appState.page = evt.target.value;
        });

        this.querySelectorAll('input[name=filterType]').forEach(radio => {
            const checked = appState.filterType === parseInt(radio.getAttribute('value'));
            setBooleanAttribute(radio, 'checked', checked);

            radio.onchange = evt => (appState.filterType = radio.getAttribute('value'));
        });

        this.onUpdate();
    }

    onUpdate() {
        // Update input placeholder based on search type
        this.querySelector('input[type=search]').placeholder = appState.filterType === 1
            ? 'Search for offset or function name...'
            : 'Search for instruction...';

        // Update page number and max page
        this.querySelector('fieldset#pageDisplay > legend').textContent = `Page ${appState.page + 1} of ${Math.max(1, appState.pageCount())}`;

        // Disable prev/next buttons on first/last page
        setBooleanAttribute(this.querySelector('button#pagePrev'), 'disabled', appState.page === 0);
        setBooleanAttribute(this.querySelector('button#pageNext'), 'disabled', appState.page === appState.maxPage());

        // Update page select dropdown
        const pageSelect = this.querySelector('select#pageSelect');
        setBooleanAttribute(pageSelect, 'disabled', appState.resultsCount() === 0);
        pageSelect.innerHTML = '';

        if (appState.resultsCount() === 0) {
            const opt = document.createElement('option');
            opt.textContent = '- no results -';
            pageSelect.appendChild(opt);
        } else {
            for (const row of appState.pageHeadings()) {
                const opt = document.createElement('option');
                opt.value = row[0];
                if (appState.page === row[0]) {
                    opt.setAttribute('selected', '');
                }

                const [start, end] = [row[1], row[2]];

                opt.textContent = `${appState.sortCol}: ${start} to ${end}`;
                pageSelect.appendChild(opt);
            }
        }

        // Update row count
        this.querySelector('#rowcount').textContent = `${appState.resultsCount()}`;
    }
}

// Main application.
class ListingTable extends window.HTMLElement {
    constructor() {
        super();

        // Register to receive updates
        appState.addListener(() => this.somethingChanged());
    }

    setDiffRow(address, shouldExpand) {
        const tbody = this.querySelector('tbody');
        const funcrow = tbody.querySelector(`func-row[data-address="${address}"]`);
        if (funcrow === null) {
            return;
        }

        const existing = tbody.querySelector(`diff-row[data-address="${address}"]`);
        if (existing !== null) {
            if (!shouldExpand) {
                tbody.removeChild(existing);
            }

            return;
        }

        const diffrow = document.createElement('diff-row');
        diffrow.address = address;

        // Decide what goes inside the diff row.
        const obj = getDataByAddr(address);

        if ('stub' in obj) {
            const msg = document.createElement('no-diff');
            const p = document.createElement('div');
            p.innerText = 'Stub. No diff.';
            msg.appendChild(p);
            diffrow.appendChild(msg);
        } else if (obj.diff.length === 0) {
            const msg = document.createElement('no-diff');
            const p = document.createElement('div');
            p.innerText = 'Identical function - no diff';
            msg.appendChild(p);
            diffrow.appendChild(msg);
        } else {
            const dd = new DiffDisplay();
            dd.option = '1';
            dd.address = address;
            diffrow.appendChild(dd);
        }

        // Insert the diff row after the parent func row.
        tbody.insertBefore(diffrow, funcrow.nextSibling);
    }

    connectedCallback() {
        const thead = this.querySelector('thead');
        const headers = thead.querySelectorAll('th:not([data-no-sort])'); // TODO
        headers.forEach(th => {
            const col = th.getAttribute('data-col');
            if (col) {
                const span = th.querySelector('span');
                if (span) {
                    span.addEventListener('click', evt => { appState.sortCol = col; });
                }
            }
        });

        this.somethingChanged();
    }

    somethingChanged() {
        // Toggle recomp/diffs column
        setBooleanAttribute(this.querySelector('table'), 'show-recomp', appState.showRecomp);
        this.querySelectorAll('func-row[data-address]').forEach(row => {
            setBooleanAttribute(row, 'show-recomp', appState.showRecomp);
        });

        const thead = this.querySelector('thead');
        const headers = thead.querySelectorAll('th');

        // Update sort indicator
        headers.forEach(th => {
            const col = th.getAttribute('data-col');
            const indicator = th.querySelector('sort-indicator');
            if (indicator === null) {
                return;
            }

            if (appState.sortCol === col) {
                indicator.setAttribute('data-sort', appState.sortDesc ? 'desc' : 'asc');
            } else {
                indicator.removeAttribute('data-sort');
            }
        });

        // Add the rows
        const tbody = this.querySelector('tbody');
        tbody.innerHTML = ''; // Clear the previous page of rows before re-adding.

        for (const obj of appState.pageSlice()) {
            const row = document.createElement('func-row');
            row.setAttribute('data-address', obj.address); // Used by setDiffRow to find this row again.
            row.addEventListener('name-click', evt => {
                appState.toggleExpanded(obj.address);
                this.setDiffRow(obj.address, appState.isExpanded(obj.address));
            });
            setBooleanAttribute(row, 'show-recomp', appState.showRecomp);
            setBooleanAttribute(row, 'expanded', appState.isExpanded(obj.address));

            const items = [
                ['address', obj.address],
                ['recomp', obj.recomp],
                ['name', obj.name],
                ['diffs', countDiffs(obj)],
                ['matching', getMatchPercentText(obj)]
            ];

            items.forEach(([slotName, content]) => {
                const div = document.createElement('span');
                div.setAttribute('slot', slotName);
                div.innerText = content;
                row.appendChild(div);
            });

            tbody.appendChild(row);

            if (appState.isExpanded(obj.address)) {
                this.setDiffRow(obj.address, true);
            }
        }
    }
}

window.onload = () => {
    window.customElements.define('listing-table', ListingTable);
    window.customElements.define('listing-options', ListingOptions);
    window.customElements.define('diff-display', DiffDisplay);
    window.customElements.define('diff-display-options', DiffDisplayOptions);
    window.customElements.define('sort-indicator', SortIndicator);
    window.customElements.define('func-row', FuncRow);
    window.customElements.define('diff-row', DiffRow);
    window.customElements.define('no-diff', NoDiffMessage);
    window.customElements.define('can-copy', CanCopy);
};
@ -1,344 +0,0 @@
#!/usr/bin/env python3

import argparse
import base64
import json
import logging
import os
from datetime import datetime

from isledecomp import (
    Bin,
    get_file_in_script_dir,
    print_combined_diff,
    diff_json,
    percent_string,
)
from isledecomp.compare import Compare as IsleCompare
from isledecomp.types import SymbolType
from pystache import Renderer
import colorama

colorama.just_fix_windows_console()


def gen_json(json_file: str, orig_file: str, data):
    """Create a JSON file that contains the comparison summary"""

    # If the structure of the JSON file ever changes, we would run into a problem
    # reading an older format file in the CI action. Mark which version we are
    # generating so we could potentially address this down the road.
    json_format_version = 1

    # Remove the diff field
    reduced_data = [
        {key: value for (key, value) in obj.items() if key != "diff"} for obj in data
    ]

    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(
            {
                "file": os.path.basename(orig_file).lower(),
                "format": json_format_version,
                "timestamp": datetime.now().timestamp(),
                "data": reduced_data,
            },
            f,
        )
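
Because the format version is written into the report, a consumer can refuse files it does not understand. A minimal reader sketch (load_summary is illustrative, not part of this script):

    def load_summary(json_file: str) -> list:
        """Load a summary produced by gen_json, checking the format marker."""
        with open(json_file, "r", encoding="utf-8") as f:
            report = json.load(f)
        if report.get("format") != 1:
            raise ValueError(f"Unsupported summary format: {report.get('format')}")
        return report["data"]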


def gen_html(html_file, data):
    js_path = get_file_in_script_dir("reccmp.js")
    with open(js_path, "r", encoding="utf-8") as f:
        reccmp_js = f.read()

    output_data = Renderer().render_path(
        get_file_in_script_dir("template.html"), {"data": data, "reccmp_js": reccmp_js}
    )

    with open(html_file, "w", encoding="utf-8") as htmlfile:
        htmlfile.write(output_data)


def gen_svg(svg_file, name_svg, icon, svg_implemented_funcs, total_funcs, raw_accuracy):
    icon_data = None
    if icon:
        with open(icon, "rb") as iconfile:
            icon_data = base64.b64encode(iconfile.read()).decode("utf-8")

    total_statistic = raw_accuracy / total_funcs
    full_percentbar_width = 127.18422
    output_data = Renderer().render_path(
        get_file_in_script_dir("template.svg"),
        {
            "name": name_svg,
            "icon": icon_data,
            "implemented": f"{(svg_implemented_funcs / total_funcs * 100):.2f}% ({svg_implemented_funcs}/{total_funcs})",
            "accuracy": f"{(raw_accuracy / svg_implemented_funcs * 100):.2f}%",
            "progbar": total_statistic * full_percentbar_width,
            "percent": f"{(total_statistic * 100):.2f}%",
        },
    )
    with open(svg_file, "w", encoding="utf-8") as svgfile:
        svgfile.write(output_data)
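
The rendered bar is simply the overall statistic scaled by the full bar width in SVG user units; for instance (numbers invented):

    total_statistic = 0.425  # 42.5% total effective accuracy
    print(total_statistic * 127.18422)  # ~54.05 of 127.18422 units filled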


def print_match_verbose(match, show_both_addrs: bool = False, is_plain: bool = False):
    percenttext = percent_string(
        match.effective_ratio, match.is_effective_match, is_plain
    )

    if show_both_addrs:
        addrs = f"0x{match.orig_addr:x} / 0x{match.recomp_addr:x}"
    else:
        addrs = hex(match.orig_addr)

    if match.is_stub:
        print(f"{addrs}: {match.name} is a stub. No diff.")
        return

    if match.effective_ratio == 1.0:
        ok_text = (
            "OK!"
            if is_plain
            else (colorama.Fore.GREEN + "✨ OK! ✨" + colorama.Style.RESET_ALL)
        )
        if match.ratio == 1.0:
            print(f"{addrs}: {match.name} 100% match.\n\n{ok_text}\n\n")
        else:
            print(
                f"{addrs}: {match.name} Effective 100% match. (Differs in register allocation only)\n\n{ok_text} (still differs in register allocation)\n\n"
            )
    else:
        print_combined_diff(match.udiff, is_plain, show_both_addrs)

        print(
            f"\n{match.name} is only {percenttext} similar to the original, diff above"
        )


def print_match_oneline(match, show_both_addrs: bool = False, is_plain: bool = False):
    percenttext = percent_string(
        match.effective_ratio, match.is_effective_match, is_plain
    )

    if show_both_addrs:
        addrs = f"0x{match.orig_addr:x} / 0x{match.recomp_addr:x}"
    else:
        addrs = hex(match.orig_addr)

    if match.is_stub:
        print(f" {match.name} ({addrs}) is a stub.")
    else:
        print(f" {match.name} ({addrs}) is {percenttext} similar to the original")


def parse_args() -> argparse.Namespace:
    def virtual_address(value) -> int:
        """Helper method for argparse, verbose parameter"""
        return int(value, 16)

    parser = argparse.ArgumentParser(
        allow_abbrev=False,
        description="Recompilation Compare: compare an original EXE with a recompiled EXE + PDB.",
    )
    parser.add_argument(
        "original", metavar="original-binary", help="The original binary"
    )
    parser.add_argument(
        "recompiled", metavar="recompiled-binary", help="The recompiled binary"
    )
    parser.add_argument(
        "pdb", metavar="recompiled-pdb", help="The PDB of the recompiled binary"
    )
    parser.add_argument(
        "decomp_dir", metavar="decomp-dir", help="The decompiled source tree"
    )
    parser.add_argument(
        "--total",
        "-T",
        metavar="<count>",
        help="Total number of expected functions (improves total accuracy statistic)",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        metavar="<offset>",
        type=virtual_address,
        help="Print assembly diff for specific function (original file's offset)",
    )
    parser.add_argument(
        "--json",
        metavar="<file>",
        help="Generate JSON file with match summary",
    )
    parser.add_argument(
        "--diff",
        metavar="<file>",
        help="Diff against summary in JSON file",
    )
    parser.add_argument(
        "--html",
        "-H",
        metavar="<file>",
        help="Generate searchable HTML summary of status and diffs",
    )
    parser.add_argument(
        "--no-color", "-n", action="store_true", help="Do not color the output"
    )
    parser.add_argument(
        "--svg", "-S", metavar="<file>", help="Generate SVG graphic of progress"
    )
    parser.add_argument("--svg-icon", metavar="icon", help="Icon to use in SVG (PNG)")
    parser.add_argument(
        "--print-rec-addr",
        action="store_true",
        help="Print addresses of recompiled functions too",
    )
    parser.add_argument(
        "--silent",
        action="store_true",
        help="Don't display text summary of matches",
    )

    parser.set_defaults(loglevel=logging.INFO)
    parser.add_argument(
        "--debug",
        action="store_const",
        const=logging.DEBUG,
        dest="loglevel",
        help="Print script debug information",
    )

    args = parser.parse_args()

    if not os.path.isfile(args.original):
        parser.error(f"Original binary {args.original} does not exist")

    if not os.path.isfile(args.recompiled):
        parser.error(f"Recompiled binary {args.recompiled} does not exist")

    if not os.path.isfile(args.pdb):
        parser.error(f"Symbols PDB {args.pdb} does not exist")

    if not os.path.isdir(args.decomp_dir):
        parser.error(f"Source directory {args.decomp_dir} does not exist")

    return args


def main():
    args = parse_args()
    logging.basicConfig(level=args.loglevel, format="[%(levelname)s] %(message)s")

    with Bin(args.original, find_str=True) as origfile, Bin(
        args.recompiled
    ) as recompfile:
        if args.verbose is not None:
            # Mute logger events from compare engine
            logging.getLogger("isledecomp.compare.db").setLevel(logging.CRITICAL)
            logging.getLogger("isledecomp.compare.lines").setLevel(logging.CRITICAL)

        isle_compare = IsleCompare(origfile, recompfile, args.pdb, args.decomp_dir)

        if args.loglevel == logging.DEBUG:
            isle_compare.debug = True

        print()

        ### Compare one or none.

        if args.verbose is not None:
            match = isle_compare.compare_address(args.verbose)
            if match is None:
                print(f"Failed to find a match at address 0x{args.verbose:x}")
                return

            print_match_verbose(
                match, show_both_addrs=args.print_rec_addr, is_plain=args.no_color
            )
            return

        ### Compare everything.

        function_count = 0
        total_accuracy = 0
        total_effective_accuracy = 0
        htmlinsert = []

        for match in isle_compare.compare_all():
            if not args.silent and args.diff is None:
                print_match_oneline(
                    match, show_both_addrs=args.print_rec_addr, is_plain=args.no_color
                )

            if match.match_type == SymbolType.FUNCTION and not match.is_stub:
                function_count += 1
                total_accuracy += match.ratio
                total_effective_accuracy += match.effective_ratio

            # If html, record the diffs to an HTML file
            html_obj = {
                "address": f"0x{match.orig_addr:x}",
                "recomp": f"0x{match.recomp_addr:x}",
                "name": match.name,
                "matching": match.effective_ratio,
            }

            if match.is_effective_match:
                html_obj["effective"] = True

            if match.udiff is not None:
                html_obj["diff"] = match.udiff

            if match.is_stub:
                html_obj["stub"] = True

            htmlinsert.append(html_obj)

        # Compare with saved diff report.
        if args.diff is not None:
            with open(args.diff, "r", encoding="utf-8") as f:
                saved_data = json.load(f)

            diff_json(
                saved_data,
                htmlinsert,
                args.original,
                show_both_addrs=args.print_rec_addr,
                is_plain=args.no_color,
            )

        ## Generate files and show summary.

        if args.json is not None:
            gen_json(args.json, args.original, htmlinsert)

        if args.html is not None:
            gen_html(args.html, json.dumps(htmlinsert))

        implemented_funcs = function_count

        if args.total:
            function_count = int(args.total)

        if function_count > 0:
            effective_accuracy = total_effective_accuracy / function_count * 100
            actual_accuracy = total_accuracy / function_count * 100
            print(
                f"\nTotal effective accuracy {effective_accuracy:.2f}% across {function_count} functions ({actual_accuracy:.2f}% actual accuracy)"
            )

        if args.svg is not None:
            gen_svg(
                args.svg,
                os.path.basename(args.original),
                args.svg_icon,
                implemented_funcs,
                function_count,
                total_effective_accuracy,
            )


if __name__ == "__main__":
    raise SystemExit(main())
@ -1,365 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<title>Decompilation Status</title>
<style>
body {
    background: #202020;
    color: #f0f0f0;
    font-family: sans-serif;
}

h1 {
    text-align: center;
}

.main {
    width: 800px;
    max-width: 100%;
    margin: auto;
}

#search {
    width: 100%;
    box-sizing: border-box;
    background: #303030;
    color: #f0f0f0;
    border: 1px #f0f0f0 solid;
    padding: 0.5em;
    border-radius: 0.5em;
}

#search::placeholder {
    color: #b0b0b0;
}

#listing {
    width: 100%;
    border-collapse: collapse;
    font-family: monospace;
}

func-row:hover {
    background: #404040 !important;
}

func-row:nth-child(odd of :not([hidden])), #listing > thead th {
    background: #282828;
}

func-row:nth-child(even of :not([hidden])) {
    background: #383838;
}

table#listing {
    border: 1px #f0f0f0 solid;
}

#listing > thead th {
    padding: 0.5em;
    user-select: none;
    width: 10%;
    text-align: left;
}

#listing:not([show-recomp]) > thead th[data-col="recomp"] {
    display: none;
}

#listing > thead th > div {
    display: flex;
    column-gap: 0.5em;
}

#listing > thead th > div > span {
    cursor: pointer;
}

#listing > thead th > div > span:hover {
    text-decoration: underline;
    text-decoration-style: dotted;
}

#listing > thead th:last-child > div {
    justify-content: right;
}

#listing > thead th[data-col="name"] {
    width: 60%;
}

.diffneg {
    color: #FF8080;
}

.diffpos {
    color: #80FF80;
}

.diffslug {
    color: #8080FF;
}

.identical {
    font-style: italic;
    text-align: center;
}

sort-indicator {
    user-select: none;
}

.filters {
    align-items: top;
    display: flex;
    font-size: 10pt;
    justify-content: space-between;
    margin: 0.5em 0 1em 0;
}

.filters > fieldset {
    /* checkbox and radio buttons v-aligned with text */
    align-items: center;
    display: flex;
}

.filters > fieldset > input, .filters > fieldset > label {
    cursor: pointer;
}

.filters > fieldset > label {
    margin-right: 10px;
}

table.diffTable {
    border-collapse: collapse;
}

table.diffTable:not(:last-child) {
    /* visual gap *between* diff context groups */
    margin-bottom: 40px;
}

table.diffTable td, table.diffTable th {
    border: 0 none;
    padding: 0 10px 0 0;
}

table.diffTable th {
    /* don't break address if asm line is long */
    word-break: keep-all;
}

diff-display[data-option="0"] th:nth-child(1) {
    display: none;
}

diff-display[data-option="0"] th:nth-child(2),
diff-display[data-option="1"] th:nth-child(2) {
    display: none;
}

label {
    user-select: none;
}

#pageDisplay > button {
    cursor: pointer;
    padding: 0.25em 0.5em;
}

#pageDisplay select {
    cursor: pointer;
    padding: 0.25em;
    margin: 0 0.5em;
}

p.rowcount {
    align-self: flex-end;
    font-size: 1.2em;
    margin-bottom: 0;
}
</style>
<script>var data = {{{data}}};</script>
<script>{{{reccmp_js}}}</script>
</head>
<body>
<div class="main">
  <h1>Decompilation Status</h1>
  <listing-options>
    <input id="search" type="search" placeholder="Search for offset or function name...">
    <div class="filters">
      <fieldset>
        <legend>Options:</legend>
        <input type="checkbox" id="cbHidePerfect" />
        <label for="cbHidePerfect">Hide 100% match</label>
        <input type="checkbox" id="cbHideStub" />
        <label for="cbHideStub">Hide stubs</label>
        <input type="checkbox" id="cbShowRecomp" />
        <label for="cbShowRecomp">Show recomp address</label>
      </fieldset>
      <fieldset>
        <legend>Search filters on:</legend>
        <input type="radio" name="filterType" id="filterName" value=1 checked />
        <label for="filterName">Name/address</label>
        <input type="radio" name="filterType" id="filterAsm" value=2 />
        <label for="filterAsm">Asm output</label>
        <input type="radio" name="filterType" id="filterDiff" value=3 />
        <label for="filterDiff">Asm diffs only</label>
      </fieldset>
    </div>
    <div class="filters">
      <p class="rowcount">Results: <span id="rowcount"></span></p>
      <fieldset id="pageDisplay">
        <legend>Page</legend>
        <button id="pagePrev">prev</button>
        <select id="pageSelect">
        </select>
        <button id="pageNext">next</button>
      </fieldset>
    </div>
  </listing-options>
  <listing-table>
    <table id="listing">
      <thead>
        <tr>
          <th data-col="address">
            <div>
              <span>Address</span>
              <sort-indicator/>
            </div>
          </th>
          <th data-col="recomp">
            <div>
              <span>Recomp</span>
              <sort-indicator/>
            </div>
          </th>
          <th data-col="name">
            <div>
              <span>Name</span>
              <sort-indicator/>
            </div>
          </th>
          <th data-col="diffs" data-no-sort></th>
          <th data-col="matching">
            <div>
              <sort-indicator></sort-indicator>
              <span>Matching</span>
            </div>
          </th>
        </tr>
      </thead>
      <tbody>
      </tbody>
    </table>
  </listing-table>
</div>
<template id="funcrow-template">
  <style>
    :host(:not([hidden])) {
      display: table-row;
    }

    :host(:not([show-recomp])) > div[data-col="recomp"] {
      display: none;
    }

    div[data-col="name"]:hover {
      cursor: pointer;
    }

    div[data-col="name"]:hover > ::slotted(*) {
      text-decoration: underline;
      text-decoration-style: dotted;
    }

    ::slotted(*:not([slot="name"])) {
      white-space: nowrap;
    }

    :host > div {
      border-top: 1px #f0f0f0 solid;
      display: table-cell;
      padding: 0.5em;
      word-break: break-all !important;
    }

    :host > div:last-child {
      text-align: right;
    }
  </style>
  <div data-col="address"><can-copy><slot name="address"></slot></can-copy></div>
  <div data-col="recomp"><can-copy><slot name="recomp"></slot></can-copy></div>
  <div data-col="name"><slot name="name"></slot></div>
  <div data-col="diffs"><slot name="diffs"></slot></div>
  <div data-col="matching"><slot name="matching"></slot></div>
</template>
<template id="diffrow-template">
  <style>
    :host(:not([hidden])) {
      display: table-row;
      contain: paint;
    }

    td.singleCell {
      border: 1px #f0f0f0 solid;
      border-bottom: 0px none;
      display: table-cell;
      padding: 0.5em;
      word-break: break-all !important;
    }
  </style>
  <td class="singleCell" colspan="5">
    <slot></slot>
  </td>
</template>
<template id="nodiff-template">
  <style>
    ::slotted(*) {
      font-style: italic;
      text-align: center;
    }
  </style>
  <slot></slot>
</template>
<template id="can-copy-template">
  <style>
    :host {
      position: relative;
    }
    ::slotted(*) {
      cursor: pointer;
    }
    slot::after {
      background-color: #fff;
      color: #222;
      display: none;
      font-size: 12px;
      padding: 1px 2px;
      width: fit-content;
      border-radius: 1px;
      text-align: center;
      bottom: 120%;
      box-shadow: 0 4px 14px 0 rgba(0,0,0,.2), 0 0 0 1px rgba(0,0,0,.05);
      position: absolute;
      white-space: nowrap;
      transition: .1s;
      content: 'Copy to clipboard';
    }
    ::slotted(*:hover) {
      text-decoration: underline;
      text-decoration-style: dotted;
    }
    slot:hover::after {
      display: block;
    }
    :host([copied]) > slot:hover::after {
      content: 'Copied!';
    }
  </style>
  <slot></slot>
</template>
</body>
</html>
@ -1,119 +0,0 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->

<svg
   width="640"
   height="480"
   viewBox="0 0 169.33333 127"
   version="1.1"
   id="svg5"
   xml:space="preserve"
   sodipodi:docname="template.svg"
   inkscape:version="1.2.2 (b0a8486541, 2022-12-01)"
   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
   xmlns:xlink="http://www.w3.org/1999/xlink"
   xmlns="http://www.w3.org/2000/svg"
   xmlns:svg="http://www.w3.org/2000/svg"><sodipodi:namedview
     id="namedview26"
     pagecolor="#505050"
     bordercolor="#eeeeee"
     borderopacity="1"
     inkscape:showpageshadow="0"
     inkscape:pageopacity="0"
     inkscape:pagecheckerboard="0"
     inkscape:deskcolor="#505050"
     showgrid="false"
     inkscape:zoom="1.6046875"
     inkscape:cx="158.90944"
     inkscape:cy="220.6037"
     inkscape:window-width="2560"
     inkscape:window-height="1379"
     inkscape:window-x="0"
     inkscape:window-y="0"
     inkscape:window-maximized="1"
     inkscape:current-layer="g1273" /><defs
     id="defs5">
    <clipPath
       id="progBarCutoff">
      <rect
         width="{{progbar}}"
         height="8.6508904"
         x="21.118132"
         y="134.05507"
         id="rect2" />
    </clipPath>
  </defs><g
     id="g1273"
     transform="matrix(1.2683581,0,0,1.2683581,-22.720969,-65.913871)"><image
       width="53.066437"
       height="53.066437"
       preserveAspectRatio="none"
       style="image-rendering:optimizeSpeed"
       xlink:href="data:image/png;base64,{{icon}}"
       id="image1060"
       x="58.13345"
       y="51.967873" /><text
       xml:space="preserve"
       style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:12.7px;font-family:monospace;-inkscape-font-specification:mono;text-align:center;text-anchor:middle;fill:#ffffff;stroke:#000000;stroke-width:1.25161812;stroke-opacity:1;stroke-dasharray:none;paint-order:stroke fill markers"
       x="84.666656"
       y="118.35877"
       id="text740"><tspan
         id="tspan738"
         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-family:monospace;-inkscape-font-specification:mono;text-align:center;text-anchor:middle;stroke:#000000;stroke-width:1.25161812;stroke-opacity:1;stroke-dasharray:none;paint-order:stroke fill markers"
         x="84.666656"
         y="118.35877">{{name}}</tspan></text><g
       id="g1250"
       transform="translate(-0.04358834,8.1397473)"><rect
         style="display:inline;fill:none;fill-opacity:1;stroke:#000000;stroke-width:2.50324;stroke-dasharray:none;stroke-opacity:1"
         id="rect1619"
         width="127.18422"
         height="8.6508904"
         x="21.118132"
         y="134.05507" /><rect
         style="display:inline;fill:#000000;fill-opacity:1;stroke:#ffffff;stroke-width:0.87411;stroke-dasharray:none;stroke-opacity:1"
         id="rect1167"
         width="127.18422"
         height="8.6508904"
         x="21.118132"
         y="134.05507" /><text
         xml:space="preserve"
         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:4.23333px;font-family:monospace;-inkscape-font-specification:mono;text-align:start;text-anchor:start;fill:#ffffff;fill-opacity:1;stroke:none;stroke-width:1.05833;stroke-dasharray:none;stroke-opacity:1"
         x="76.884926"
         y="139.89182"
         id="text2152"><tspan
           style="font-size:4.23333px;fill:#ffffff;fill-opacity:1;stroke-width:1.05833"
           x="76.884926"
           y="139.89182"
           id="tspan2150">{{percent}}</tspan></text><rect
         style="display:inline;fill:#ffffff;stroke:none;stroke-width:2.6764"
         id="rect1169"
         width="127.18422"
         height="8.6508904"
         x="21.118132"
         y="134.05507"
         clip-path="url(#progBarCutoff)" /><text
         xml:space="preserve"
         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:4.23333px;font-family:monospace;-inkscape-font-specification:mono;text-align:start;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1.05833;stroke-dasharray:none;stroke-opacity:1"
         x="76.884926"
         y="139.89182"
         id="text18"
         clip-path="url(#progBarCutoff)"
         inkscape:label="text18"><tspan
           style="font-size:4.23333px;fill:#000000;fill-opacity:1;stroke-width:1.05833"
           x="76.884926"
           y="139.89182"
           id="tspan16">{{percent}}</tspan></text></g><text
       xml:space="preserve"
       style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:4.23333px;font-family:monospace;-inkscape-font-specification:mono;text-align:start;text-anchor:start;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.83441208;stroke-dasharray:none;stroke-opacity:1;opacity:1;stroke-linejoin:miter;stroke-linecap:butt;paint-order:stroke fill markers"
       x="46.947659"
       y="129.67447"
       id="text1260"><tspan
         id="tspan1258"
         style="font-size:4.23333px;stroke-width:0.83441208;stroke:#000000;stroke-opacity:1;stroke-dasharray:none;stroke-linejoin:miter;stroke-linecap:butt;paint-order:stroke fill markers"
         x="46.947659"
         y="129.67447">Implemented: {{implemented}}</tspan><tspan
         style="font-size:4.23333px;stroke-width:0.83441208;stroke:#000000;stroke-opacity:1;stroke-dasharray:none;stroke-linejoin:miter;stroke-linecap:butt;paint-order:stroke fill markers"
         x="46.947659"
         y="134.96613"
         id="tspan1262">Accuracy: {{accuracy}}</tspan></text></g></svg>
@ -1,11 +1,3 @@
tools/isledecomp
capstone
reccmp @ git+https://github.com/isledecomp/reccmp
clang==16.*
colorama>=0.4.6
isledecomp
pystache
pyyaml
git+https://github.com/wbenny/pydemangler.git
# requirement of capstone due to python dropping distutils.
# see: https://github.com/capstone-engine/capstone/issues/2223
setuptools ; python_version >= "3.12"
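
Since the new reccmp dependency is pulled straight from Git, a quick way to confirm the requirement resolved after `pip install -r tools/requirements.txt` (illustrative snippet, not part of the repo):

    from importlib.metadata import PackageNotFoundError, version

    try:
        print("reccmp", version("reccmp"))
    except PackageNotFoundError:
        raise SystemExit("reccmp is not installed; run pip install -r tools/requirements.txt")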
@ -1,494 +0,0 @@
"""For all addresses matched by code annotations or recomp pdb,
report how "far off" the recomp symbol is from its proper place
in the original binary."""

import os
import argparse
import logging
import statistics
import bisect
from typing import Iterator, List, Optional, Tuple
from collections import namedtuple
from isledecomp import Bin as IsleBin
from isledecomp.bin import InvalidVirtualAddressError
from isledecomp.cvdump import Cvdump
from isledecomp.compare import Compare as IsleCompare
from isledecomp.types import SymbolType

# Ignore all compare-db messages.
logging.getLogger("isledecomp.compare").addHandler(logging.NullHandler())


def or_blank(value) -> str:
    """Helper for dealing with potential None values in text output."""
    return "" if value is None else str(value)


class ModuleMap:
    """Load a subset of sections from the pdb to allow you to look up the
    module number based on the recomp address."""

    def __init__(self, pdb, binfile) -> None:
        cvdump = Cvdump(pdb).section_contributions().modules().run()
        self.module_lookup = {m.id: (m.lib, m.obj) for m in cvdump.modules}
        self.library_lookup = {m.obj: m.lib for m in cvdump.modules}
        self.section_contrib = [
            (
                binfile.get_abs_addr(sizeref.section, sizeref.offset),
                sizeref.size,
                sizeref.module,
            )
            for sizeref in cvdump.sizerefs
            if binfile.is_valid_section(sizeref.section)
        ]

        # For bisect performance enhancement
        self.contrib_starts = [start for (start, _, __) in self.section_contrib]

    def get_lib_for_module(self, module: str) -> Optional[str]:
        return self.library_lookup.get(module)

    def get_all_cmake_modules(self) -> List[str]:
        return [
            obj
            for (_, (__, obj)) in self.module_lookup.items()
            if obj.startswith("CMakeFiles")
        ]

    def get_module(self, addr: int) -> Optional[str]:
        i = bisect.bisect_left(self.contrib_starts, addr)
        # We don't want the insertion point given by bisect, but the
        # section contribution that contains the address. If addr matches
        # a section contribution start exactly, we are in the right spot.
        # Otherwise we need to subtract one here. (Also guard against the
        # insertion point falling past the end of the list.)
        if i == len(self.contrib_starts) or self.contrib_starts[i] != addr:
            i -= 1

        # Safety catch: clamp to range of indices from section_contrib.
        i = max(0, min(i, len(self.section_contrib) - 1))

        (start, size, module_id) = self.section_contrib[i]
        if start <= addr < start + size:
            if (module := self.module_lookup.get(module_id)) is not None:
                return module

        return None
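
The bisect_left adjustment is the subtle part: the insertion point is only correct on an exact hit; otherwise it points one past the containing contribution. A standalone illustration (section starts invented):

    import bisect

    starts = [0x1000, 0x2000, 0x3000]
    for addr in (0x2000, 0x2004):
        i = bisect.bisect_left(starts, addr)
        if i == len(starts) or starts[i] != addr:
            i -= 1  # step back to the contribution that contains addr
        print(hex(addr), "->", hex(starts[i]))
    # 0x2000 -> 0x2000
    # 0x2004 -> 0x2000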


def print_sections(sections):
    print("    name |    start |   v.size | raw size")
    print("---------|----------|----------|----------")
    for sect in sections:
        name = sect.name
        print(
            f"{name:>8} | {sect.virtual_address:8x} | {sect.virtual_size:8x} | {sect.size_of_raw_data:8x}"
        )
    print()


ALLOWED_TYPE_ABBREVIATIONS = ["fun", "dat", "poi", "str", "vta", "flo"]


def match_type_abbreviation(mtype: Optional[SymbolType]) -> str:
    """Return abbreviation of the given SymbolType name"""
    if mtype is None:
        return ""

    return mtype.name.lower()[:3]


def get_cmakefiles_prefix(module: str) -> str:
    """For the given .obj, get the "CMakeFiles/something.dir/" prefix.
    For lack of a better option, this is the library for this module."""
    if module.startswith("CMakeFiles"):
        return "/".join(module.split("/", 2)[:2]) + "/"

    return module


def truncate_module_name(prefix: str, module: str) -> str:
    """Remove the CMakeFiles prefix and the .obj suffix for the given module.
    Input: CMakeFiles/lego1.dir/, CMakeFiles/lego1.dir/LEGO1/define.cpp.obj
    Output: LEGO1/define.cpp"""

    if module.startswith(prefix):
        module = module[len(prefix) :]

    if module.endswith(".obj"):
        module = module[:-4]

    return module


def avg_remove_outliers(entries: List[int]) -> int:
    """Compute the average from this list of entries (addresses)
    after removing outlier values."""

    if len(entries) == 1:
        return entries[0]

    avg = statistics.mean(entries)
    sd = statistics.pstdev(entries)

    return int(statistics.mean([e for e in entries if abs(e - avg) <= 2 * sd]))
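
A worked example of the trimming (values invented): one stray annotation far from the cluster is dropped before averaging. Note that with only a handful of entries, no single value can sit more than two population standard deviations out, so a lone outlier in a very small sample survives the filter.

    entries = [100] * 9 + [10_000]
    print(statistics.mean(entries))      # 1090 -- skewed by the stray value
    print(avg_remove_outliers(entries))  # 100 -- the stray value is > 2 sd away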


RoadmapRow = namedtuple(
    "RoadmapRow",
    [
        "orig_sect_ofs",
        "recomp_sect_ofs",
        "orig_addr",
        "recomp_addr",
        "displacement",
        "sym_type",
        "size",
        "name",
        "module",
    ],
)


class DeltaCollector:
    """Reads each row of the results and aggregates information about the
    placement of each module."""

    def __init__(self, match_type: str = "fun") -> None:
        # The displacement for each symbol from each module
        self.disp_map = {}

        # Each address for each module
        self.addresses = {}

        # The earliest address for each module
        self.earliest = {}

        # String abbreviation for which symbol type we are checking
        self.match_type = "fun"

        match_type = str(match_type).strip().lower()[:3]
        if match_type in ALLOWED_TYPE_ABBREVIATIONS:
            self.match_type = match_type

    def read_row(self, row: RoadmapRow):
        if row.module is None:
            return

        if row.sym_type != self.match_type:
            return

        if row.orig_addr is not None:
            if row.module not in self.addresses:
                self.addresses[row.module] = []

            self.addresses[row.module].append(row.orig_addr)

            if row.orig_addr < self.earliest.get(row.module, 0xFFFFFFFFF):
                self.earliest[row.module] = row.orig_addr

        if row.displacement is not None:
            if row.module not in self.disp_map:
                self.disp_map[row.module] = []

            self.disp_map[row.module].append(row.displacement)

    def iter_sorted(self) -> Iterator[Tuple[int, str]]:
        """Compute the average address for each module, then generate them
        in ascending order."""
        avg_address = {
            mod: avg_remove_outliers(values) for mod, values in self.addresses.items()
        }
        for mod, avg in sorted(avg_address.items(), key=lambda x: x[1]):
            yield (avg, mod)
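    # Usage sketch (mirrors suggest_order below; rows here are hypothetical):
    #   dc = DeltaCollector("fun")
    #   for row in results:
    #       dc.read_row(row)
    #   for avg_addr, module in dc.iter_sorted():
    #       ...  # modules in ascending order of average original address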


def suggest_order(results: List[RoadmapRow], module_map: ModuleMap, match_type: str):
    """Suggest the order of modules for CMakeLists.txt"""

    dc = DeltaCollector(match_type)
    for row in results:
        dc.read_row(row)

    # First, show the order of .obj files for the "CMake Modules".
    # Meaning: the modules where the .obj file begins with "CMakeFiles".
    # These are the libraries where we directly control the order.
    # The library name (from cvdump) doesn't make it obvious that these are
    # our libraries, so we derive the name based on the CMakeFiles prefix.
    leftover_modules = set(module_map.get_all_cmake_modules())

    # A little convoluted, but we want to take the first two tokens
    # of the string with '/' as the delimiter,
    # i.e. CMakeFiles/isle.dir/
    # The idea is to print exactly what appears in CMakeLists.txt.
    cmake_prefixes = sorted(set(get_cmakefiles_prefix(mod) for mod in leftover_modules))

    # Save this off because we'll use it again later.
    computed_order = list(dc.iter_sorted())

    for prefix in cmake_prefixes:
        print(prefix)

        last_earliest = 0
        # Show modules ordered by the computed average of addresses
        for _, module in computed_order:
            if not module.startswith(prefix):
                continue

            leftover_modules.remove(module)

            avg_displacement = None
            displacements = dc.disp_map.get(module)
            if displacements is not None and len(displacements) > 0:
                avg_displacement = int(statistics.mean(displacements))

            # Call attention to any modules where ordering by earliest
            # address is different from the computed order we display.
            earliest = dc.earliest.get(module)
            ooo_mark = "*" if earliest < last_earliest else " "
            last_earliest = earliest

            code_file = truncate_module_name(prefix, module)
            print(f"0x{earliest:08x}{ooo_mark} {or_blank(avg_displacement):>10} {code_file}")

        # These modules are included in the final binary (in some form) but
        # don't contribute any symbols of the type we are checking.
        # n.b. There could still be other modules that are part of
        # CMakeLists.txt but are not included in the pdb for whatever reason.
        # In other words: don't take the list we provide as the final word on
        # what should or should not be included.
        # This is merely a suggestion of the order.
        for module in leftover_modules:
            if not module.startswith(prefix):
                continue

            # aligned with the previous print
            code_file = truncate_module_name(prefix, module)
            print(f"         no suggestion {code_file}")

        print()

    # Now display the order of all libraries in the final file.
    library_order = {}

    for start, module in computed_order:
        lib = module_map.get_lib_for_module(module)
        if lib is None:
            lib = get_cmakefiles_prefix(module)

        if start < library_order.get(lib, 0xFFFFFFFFF):
            library_order[lib] = start

    print("Library order (average address shown):")
    for lib, start in sorted(library_order.items(), key=lambda x: x[1]):
        # Strip off any OS path for brevity
        if not lib.startswith("CMakeFiles"):
            lib = os.path.basename(lib)

        print(f"{lib:40} {start:08x}")


def print_text_report(results: List[RoadmapRow]):
    """Print the result with original and recomp addresses."""
    for row in results:
        print(
            " ".join(
                [
                    f"{or_blank(row.orig_sect_ofs):14}",
                    f"{or_blank(row.recomp_sect_ofs):14}",
                    f"{or_blank(row.displacement):>8}",
                    f"{row.sym_type:3}",
                    f"{or_blank(row.size):6}",
                    or_blank(row.name),
                ]
            )
        )


def print_diff_report(results: List[RoadmapRow]):
    """Print only entries where we have the recomp address.
    This is intended for generating a file to diff against.
    The recomp addresses are always changing, so we hide those."""
    for row in results:
        if row.orig_addr is None or row.recomp_addr is None:
            continue

        print(
            " ".join(
                [
                    f"{or_blank(row.orig_sect_ofs):14}",
                    f"{or_blank(row.displacement):>8}",
                    f"{row.sym_type:3}",
                    f"{or_blank(row.size):6}",
                    or_blank(row.name),
                ]
            )
        )


def export_to_csv(csv_file: str, results: List[RoadmapRow]):
    with open(csv_file, "w+", encoding="utf-8") as f:
        f.write(
            "orig_sect_ofs,recomp_sect_ofs,orig_addr,recomp_addr,displacement,row_type,size,name,module\n"
        )
        for row in results:
            f.write(",".join(map(or_blank, row)))
            f.write("\n")


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Show all addresses from original and recomp."
    )
    parser.add_argument(
        "original", metavar="original-binary", help="The original binary"
    )
    parser.add_argument(
        "recompiled", metavar="recompiled-binary", help="The recompiled binary"
    )
    parser.add_argument(
        "pdb", metavar="recompiled-pdb", help="The PDB of the recompiled binary"
    )
    parser.add_argument(
        "decomp_dir", metavar="decomp-dir", help="The decompiled source tree"
    )
    parser.add_argument("--csv", metavar="<file>", help="If set, export to CSV")
    parser.add_argument(
        "--verbose", "-v", action="store_true", help="Show recomp addresses in output"
    )
    parser.add_argument(
        "--order",
        const="fun",
        nargs="?",
        type=str,
        help="Show suggested order of modules (using the specified symbol type)",
    )
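    # Note: with nargs="?" and const="fun", a bare `--order` suggests an
    # ordering based on function symbols, while e.g. `--order dat` uses
    # data symbols instead (see ALLOWED_TYPE_ABBREVIATIONS).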

    (args, _) = parser.parse_known_args()

    if not os.path.isfile(args.original):
        parser.error(f"Original binary {args.original} does not exist")

    if not os.path.isfile(args.recompiled):
        parser.error(f"Recompiled binary {args.recompiled} does not exist")

    if not os.path.isfile(args.pdb):
        parser.error(f"Symbols PDB {args.pdb} does not exist")

    if not os.path.isdir(args.decomp_dir):
        parser.error(f"Source directory {args.decomp_dir} does not exist")

    return args


def main():
    args = parse_args()

    with IsleBin(args.original, find_str=True) as orig_bin, IsleBin(
        args.recompiled
    ) as recomp_bin:
        engine = IsleCompare(orig_bin, recomp_bin, args.pdb, args.decomp_dir)

        module_map = ModuleMap(args.pdb, recomp_bin)

        def is_same_section(orig: int, recomp: int) -> bool:
            """Compare the section name instead of the index.
            LEGO1.dll adds extra sections for some reason. (Smacker library?)"""

            try:
                orig_name = orig_bin.sections[orig - 1].name
                recomp_name = recomp_bin.sections[recomp - 1].name
                return orig_name == recomp_name
            except IndexError:
                return False

        def to_roadmap_row(match):
            orig_sect = None
            orig_ofs = None
            orig_sect_ofs = None
            recomp_sect = None
            recomp_ofs = None
            recomp_sect_ofs = None
            orig_addr = None
            recomp_addr = None
            displacement = None
            module_name = None

            if match.recomp_addr is not None and recomp_bin.is_valid_vaddr(
                match.recomp_addr
            ):
                if (module_ref := module_map.get_module(match.recomp_addr)) is not None:
                    (_, module_name) = module_ref

            row_type = match_type_abbreviation(match.compare_type)
            name = (
                repr(match.name)
                if match.compare_type == SymbolType.STRING
                else match.name
            )

            if match.orig_addr is not None:
                orig_addr = match.orig_addr
                (orig_sect, orig_ofs) = orig_bin.get_relative_addr(match.orig_addr)
                orig_sect_ofs = f"{orig_sect:04}:{orig_ofs:08x}"

            if match.recomp_addr is not None:
                recomp_addr = match.recomp_addr
                (recomp_sect, recomp_ofs) = recomp_bin.get_relative_addr(
                    match.recomp_addr
                )
                recomp_sect_ofs = f"{recomp_sect:04}:{recomp_ofs:08x}"

            if (
                orig_sect is not None
                and recomp_sect is not None
                and is_same_section(orig_sect, recomp_sect)
            ):
                displacement = recomp_ofs - orig_ofs

            return RoadmapRow(
                orig_sect_ofs,
                recomp_sect_ofs,
                orig_addr,
                recomp_addr,
                displacement,
                row_type,
                match.size,
                name,
                module_name,
            )

        def roadmap_row_generator(matches):
            for match in matches:
                try:
                    yield to_roadmap_row(match)
                except InvalidVirtualAddressError:
                    # This is here to work around the fact that we have RVA
                    # values (i.e. not real virtual addrs) in our compare db.
                    pass

        results = list(roadmap_row_generator(engine.get_all()))

        if args.order is not None:
            suggest_order(results, module_map, args.order)
            return

        if args.csv is None:
            if args.verbose:
                print("ORIG sections:")
                print_sections(orig_bin.sections)

                print("RECOMP sections:")
                print_sections(recomp_bin.sections)

                print_text_report(results)
            else:
                print_diff_report(results)

        if args.csv is not None:
            export_to_csv(args.csv, results)


if __name__ == "__main__":
    main()
@ -1,364 +0,0 @@
from dataclasses import dataclass
import re
import logging
import os
import argparse
import struct
from typing import Dict, List, NamedTuple, Optional, Set, Tuple

from isledecomp import Bin
from isledecomp.compare import Compare as IsleCompare
from isledecomp.compare.diff import CombinedDiffOutput
from isledecomp.cvdump.symbols import SymbolsEntry
import colorama

# pylint: disable=duplicate-code # misdetects a code duplication with reccmp

colorama.just_fix_windows_console()

CHECK_ICON = f"{colorama.Fore.GREEN}✓{colorama.Style.RESET_ALL}"
SWAP_ICON = f"{colorama.Fore.YELLOW}⇄{colorama.Style.RESET_ALL}"
ERROR_ICON = f"{colorama.Fore.RED}✗{colorama.Style.RESET_ALL}"
UNCLEAR_ICON = f"{colorama.Fore.BLUE}?{colorama.Style.RESET_ALL}"


STACK_ENTRY_REGEX = re.compile(
    r"(?P<register>e[sb]p)\s(?P<sign>[+-])\s(?P<offset>(0x)?[0-9a-f]+)(?![0-9a-f])"
)


@dataclass
class StackSymbol:
    name: str
    data_type: str


@dataclass
class StackRegisterOffset:
    register: str
    offset: int
    symbol: Optional[StackSymbol] = None

    def __str__(self) -> str:
        first_part = (
            f"{self.register} + {self.offset:#04x}"
            if self.offset > 0
            else f"{self.register} - {-self.offset:#04x}"
        )
        second_part = f" {self.symbol.name}" if self.symbol else ""
        return first_part + second_part

    def __hash__(self) -> int:
        return hash(self.register) + self.offset

    def copy(self) -> "StackRegisterOffset":
        return StackRegisterOffset(self.register, self.offset, self.symbol)

    def __eq__(self, other: "StackRegisterOffset"):
        return self.register == other.register and self.offset == other.offset


class StackPair(NamedTuple):
    orig: StackRegisterOffset
    recomp: StackRegisterOffset


StackPairs = Set[StackPair]


@dataclass
class Warnings:
    structural_mismatches_present: bool = False
    error_map_not_bijective: bool = False


def extract_stack_offset_from_instruction(
    instruction: str,
) -> StackRegisterOffset | None:
    match = STACK_ENTRY_REGEX.search(instruction)
    if not match:
        return None
    offset = int(match.group("sign") + match.group("offset"), 16)
    return StackRegisterOffset(match.group("register"), offset)
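# Sketch of what the regex captures from a (hypothetical) disassembly line:
#   "mov eax, dword ptr [ebp - 0x8]" -> register="ebp", sign="-", offset="0x8"
# so extract_stack_offset_from_instruction() returns StackRegisterOffset("ebp", -8).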


def analyze_diff(
    diff: Dict[str, List[Tuple[str, ...]]], warnings: Warnings
) -> StackPairs:
    stack_pairs: StackPairs = set()
    if "both" in diff:
        # get the matching stack entries
        for line in diff["both"]:
            # 0 = orig addr, 1 = instruction, 2 = reccmp addr
            instruction = line[1]

            if match := extract_stack_offset_from_instruction(instruction):
                logging.debug("stack match: %s", match)
                # need a copy for recomp because we might add a debug symbol to it
                stack_pairs.add(StackPair(match, match.copy()))
            elif any(x in instruction for x in ["ebp", "esp"]):
                logging.debug("not a stack offset: %s", instruction)

    else:
        orig = diff["orig"]
        recomp = diff["recomp"]
        if len(orig) != len(recomp):
            if orig:
                mismatch_location = f"orig={orig[0][0]}"
            else:
                mismatch_location = f"recomp={recomp[0][0]}"
            logging.error(
                "Structural mismatch at %s:\n%s",
                mismatch_location,
                print_structural_mismatch(orig, recomp),
            )
            warnings.structural_mismatches_present = True
            return set()

        for orig_line, recomp_line in zip(orig, recomp):
            if orig_match := extract_stack_offset_from_instruction(orig_line[1]):
                recomp_match = extract_stack_offset_from_instruction(recomp_line[1])

                if not recomp_match:
                    logging.error(
                        "Mismatching line structure at orig=%s:\n%s",
                        orig_line[0],
                        print_structural_mismatch(orig, recomp),
                    )
                    # not recoverable, whole block has a structural mismatch
                    warnings.structural_mismatches_present = True
                    return set()

                stack_pair = StackPair(orig_match, recomp_match)

                logging.debug(
                    "stack match, wrong order: %s vs %s", stack_pair[0], stack_pair[1]
                )
                stack_pairs.add(stack_pair)

            elif any(x in orig_line[1] for x in ["ebp", "esp"]):
                logging.debug("not a stack offset: %s", orig_line[1])

    return stack_pairs
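# The `diff` dict handled above either has a "both" key (lines equal on both
# sides) or "orig"/"recomp" keys with the differing lines. A hypothetical
# mismatching entry might look like:
#   {"orig": [("0x10001000", "mov eax, dword ptr [ebp - 0x8]")],
#    "recomp": [("0x10002000", "mov eax, dword ptr [ebp - 0xc]")]}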


def print_bijective_match(left: str, right: str, exact: bool):
    icon = CHECK_ICON if exact else SWAP_ICON
    print(f"{icon}{colorama.Style.RESET_ALL} {left}: {right}")


def print_non_bijective_match(left: str, right: str):
    print(f"{ERROR_ICON} {left}: {right}")


def print_structural_mismatch(
    orig: List[Tuple[str, ...]], recomp: List[Tuple[str, ...]]
) -> str:
    orig_str = "\n".join(f"-{x[1]}" for x in orig) if orig else "-"
    recomp_str = "\n".join(f"+{x[1]}" for x in recomp) if recomp else "+"
    return f"{colorama.Fore.RED}{orig_str}\n{colorama.Fore.GREEN}{recomp_str}\n{colorama.Style.RESET_ALL}"


def format_list_of_offsets(offsets: List[StackRegisterOffset]) -> str:
    return str([str(x) for x in offsets])


def compare_function_stacks(udiff: CombinedDiffOutput, fn_symbol: SymbolsEntry):
    warnings = Warnings()

    # consists of pairs (orig, recomp)
    # don't use a dict because we can have m:n relations
    stack_pairs: StackPairs = set()

    for block in udiff:
        # block[0] is e.g. "@@ -0x10071662,60 +0x10031368,60 @@"
        for diff in block[1]:
            stack_pairs = stack_pairs.union(analyze_diff(diff, warnings))

    # Note that the 'Frame Ptr Present' property is not relevant to the stack below `ebp`,
    # but only to entries above (i.e. the function arguments on the stack).
    # See also pdb_extraction.py.

    stack_symbols: Dict[int, StackSymbol] = {}

    for symbol in fn_symbol.stack_symbols:
        if symbol.symbol_type == "S_BPREL32":
            # convert hex to signed 32-bit integer
            hex_bytes = bytes.fromhex(symbol.location[1:-1])
            stack_offset = struct.unpack(">l", hex_bytes)[0]
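            # e.g., assuming a cvdump location string like "[FFFFFFF8]":
            # stripping the brackets gives bytes 0xFFFFFFF8, which unpacks
            # as the signed big-endian long -8, i.e. a variable at ebp - 0x8.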

            stack_symbols[stack_offset] = StackSymbol(
                symbol.name,
                symbol.data_type,
            )

    for _, recomp in stack_pairs:
        if recomp.register == "ebp":
            recomp.symbol = stack_symbols.get(recomp.offset)
        elif recomp.register == "esp":
            logging.debug(
                "Matching esp offsets to debug symbols is not implemented right now"
            )

    print("\nOrdered by original stack (left=orig, right=recomp):")

    all_orig_offsets = set(x.orig.offset for x in stack_pairs)

    for orig_offset in sorted(all_orig_offsets):
        orig = next(x.orig for x in stack_pairs if x.orig.offset == orig_offset)
        recomps = [x.recomp for x in stack_pairs if x.orig == orig]

        if len(recomps) == 1:
            recomp = recomps[0]
            print_bijective_match(str(orig), str(recomp), exact=orig == recomp)
        else:
            print_non_bijective_match(str(orig), format_list_of_offsets(recomps))
            warnings.error_map_not_bijective = True

    # Show offsets from the debug symbols that we have not encountered in the diff
    all_recomp_offsets = set(x.recomp.offset for x in stack_pairs).union(
        stack_symbols.keys()
    )

    print("\nOrdered by recomp stack (left=orig, right=recomp):")
    for recomp_offset in sorted(all_recomp_offsets):
        recomp = next(
            (x.recomp for x in stack_pairs if x.recomp.offset == recomp_offset), None
        )

        if recomp is None:
            # The offset only appears in the debug symbols.
            # The legend below explains why this can happen.
            stack_offset = StackRegisterOffset(
                "ebp", recomp_offset, stack_symbols[recomp_offset]
            )
            print(f"{UNCLEAR_ICON} not seen: {stack_offset}")
            continue

        origs = [x.orig for x in stack_pairs if x.recomp == recomp]

        if len(origs) == 1:
            # 1:1 clean match
            print_bijective_match(str(origs[0]), str(recomp), origs[0] == recomp)
        else:
            print_non_bijective_match(format_list_of_offsets(origs), str(recomp))
            warnings.error_map_not_bijective = True

    print(
        "\nLegend:\n"
        + f"{SWAP_ICON} : This stack variable matches 1:1, but the order of variables is not correct.\n"
        + f"{ERROR_ICON} : This stack variable matches multiple variables in the other binary.\n"
        + f"{UNCLEAR_ICON} : This stack variable did not appear in the diff. It either matches or only appears in structural mismatches.\n"
    )

    if warnings.error_map_not_bijective:
        print(
            "ERROR: The stack variables of original and recomp are not in a 1:1 correspondence, "
            + "suggesting that the logic in the recomp is incorrect."
        )
    elif warnings.structural_mismatches_present:
        print(
            "WARNING: Original and recomp have at least one structural discrepancy, "
            + "so the comparison of stack variables might be incomplete. "
            + "The structural mismatches above need to be checked manually."
        )


def parse_args() -> argparse.Namespace:
    def virtual_address(value) -> int:
        """Helper for argparse: parse a hex string into an int"""
        return int(value, 16)

    parser = argparse.ArgumentParser(
        allow_abbrev=False,
        description="Recompilation Compare: compare an original EXE with a recompiled EXE + PDB.",
    )
    parser.add_argument(
        "original", metavar="original-binary", help="The original binary"
    )
    parser.add_argument(
        "recompiled", metavar="recompiled-binary", help="The recompiled binary"
    )
    parser.add_argument(
        "pdb", metavar="recompiled-pdb", help="The PDB of the recompiled binary"
    )
    parser.add_argument(
        "decomp_dir", metavar="decomp-dir", help="The decompiled source tree"
    )

    parser.add_argument(
        "address",
        metavar="<offset>",
        type=virtual_address,
        help="The original file's offset of the function to be analyzed",
    )

    parser.set_defaults(loglevel=logging.INFO)
    parser.add_argument(
        "--debug",
        action="store_const",
        const=logging.DEBUG,
        dest="loglevel",
        help="Print script debug information",
    )

    args = parser.parse_args()

    if not os.path.isfile(args.original):
        parser.error(f"Original binary {args.original} does not exist")

    if not os.path.isfile(args.recompiled):
        parser.error(f"Recompiled binary {args.recompiled} does not exist")

    if not os.path.isfile(args.pdb):
        parser.error(f"Symbols PDB {args.pdb} does not exist")

    if not os.path.isdir(args.decomp_dir):
        parser.error(f"Source directory {args.decomp_dir} does not exist")

    return args


def main():
    args = parse_args()
    logging.basicConfig(level=args.loglevel, format="[%(levelname)s] %(message)s")

    with Bin(args.original, find_str=True) as origfile, Bin(
        args.recompiled
    ) as recompfile:
        if args.loglevel != logging.DEBUG:
            # Mute logger events from compare engine
            logging.getLogger("isledecomp.compare.core").setLevel(logging.CRITICAL)
            logging.getLogger("isledecomp.compare.db").setLevel(logging.CRITICAL)
            logging.getLogger("isledecomp.compare.lines").setLevel(logging.CRITICAL)

        isle_compare = IsleCompare(origfile, recompfile, args.pdb, args.decomp_dir)

        if args.loglevel == logging.DEBUG:
            isle_compare.debug = True

        print()

        match = isle_compare.compare_address(args.address)
        if match is None:
            print(f"Failed to find a match at address 0x{args.address:x}")
            return

        assert match.udiff is not None

        function_data = next(
            (
                y
                for y in isle_compare.cvdump_analysis.nodes
                if y.addr == match.recomp_addr
            ),
            None,
        )
        assert function_data is not None
        assert function_data.symbol_entry is not None

        compare_function_stacks(match.udiff, function_data.symbol_entry)


if __name__ == "__main__":
    raise SystemExit(main())
@ -1,75 +0,0 @@
#!/usr/bin/env python3

import argparse
import difflib
import subprocess
import os

from isledecomp.lib import lib_path_join
from isledecomp.utils import print_diff


def main():
    parser = argparse.ArgumentParser(
        allow_abbrev=False,
        description="Verify Exports: Compare the exports of two DLLs.",
    )
    parser.add_argument(
        "original", metavar="original-binary", help="The original binary"
    )
    parser.add_argument(
        "recompiled", metavar="recompiled-binary", help="The recompiled binary"
    )
    parser.add_argument(
        "--no-color", "-n", action="store_true", help="Do not color the output"
    )

    args = parser.parse_args()

    if not os.path.isfile(args.original):
        parser.error(f"Original binary file {args.original} does not exist")

    if not os.path.isfile(args.recompiled):
        parser.error(f"Recompiled binary {args.recompiled} does not exist")

    def get_exports(file):
        call = [lib_path_join("DUMPBIN.EXE"), "/EXPORTS"]

        if os.name != "nt":
            call.insert(0, "wine")
            file = (
                subprocess.check_output(["winepath", "-w", file])
                .decode("utf-8")
                .strip()
            )

        call.append(file)

        raw = subprocess.check_output(call).decode("utf-8").split("\r\n")
        exports = []

        start = False

        for line in raw:
            if not start:
                if line == " ordinal hint name":
                    start = True
            else:
                if line:
                    exports.append(line[27 : line.rindex(" (")])
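                    # e.g. for a hypothetical DUMPBIN export row such as
                    #   "         1    0 00012340 ?Foo@@YAXXZ (void __cdecl Foo(void))"
                    # the name column begins at a fixed offset (27) and " ("
                    # introduces the undecorated form, so the slice keeps the
                    # exported name only.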
                elif exports:
                    break

        return exports

    og_exp = get_exports(args.original)
    re_exp = get_exports(args.recompiled)

    udiff = difflib.unified_diff(og_exp, re_exp)
    has_diff = print_diff(udiff, args.no_color)

    return 1 if has_diff else 0


if __name__ == "__main__":
    raise SystemExit(main())