Add tool to automatically perform symbols cross referencing (#413)

I wanted to make this tool for a long time. Given the fact HD recently
became more dense into DRA, people who are willing to contribute to that
overlay should not have the burden of manually cross-referencing symbols
as I have been doing for the past weeks. Instead of creating a tutorial
I created a tool that cross-references symbols between functions. The two
functions can be either between two different revisions of the game or
between two overlays. The latter is especially important to finally
remove all the duplicate functions.

I also took the opportunity to add some unit tests (finally!) in
`tools/tests` instead of creating the dedicated folder `tools/symbols`
with tools and tests in it. Let me know if `tools/tests` is a pattern we
want to keep following for any future tool that needs to be tested.

I also added an assembly parser written in Python. As of today it lives
in `tools/symbols.py`. It is bare-bones but it can parse the majority of
assembly lines. I also added tests around it in case we want to expand
it or extract the parser into its own utility. I've been thinking to
ideally use it for a V2 of `find_duplicates.py` if we find the current
tool to be limiting.

### Demo

```
sotn-decomp$ ./tools/symbols.py cross asm/us/dra/nonmatchings/71830/func_80112BB0.s asm/hd/dra/nonmatchings/71830/func_80112BB0.s
D_80139824 = 0x801393F0;
D_800ACF7C = 0x800AD040;
D_800ACF7E = 0x800AD042;
D_80138FC8 = 0x80138B94;
D_80138FCA = 0x80138B96;
```

These symbols can be copied and pasted as-is into `symbols.hd.dra.txt`
or the relevant symbol list.
This commit is contained in:
Luciano Ciccariello 2023-07-31 20:14:57 +01:00 committed by GitHub
parent 595ba748f0
commit d17df87e1a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 455 additions and 18 deletions

View File

@ -591,6 +591,8 @@ $(BUILD_DIR)/$(ASSETS_DIR)/%.png.o: $(ASSETS_DIR)/%.png
SHELL = /bin/bash -e -o pipefail
include tools/tools.mk

# Prerequisites are separated by whitespace only: a trailing comma becomes
# part of the target name, which would declare "all," phony instead of "all".
.PHONY: all clean format check expected
.PHONY: main dra ric cen dre mad no3 np3 nz0 st0 wrp rwrp tt_000
.PHONY: %_dirs

View File

@ -2,15 +2,27 @@
import argparse
import os
import re
import sys
# Command-line interface: a global --version option plus one sub-command per
# supported operation. The chosen sub-command name is stored in args.command.
parser = argparse.ArgumentParser(description="Perform operations on game symbols")
parser.add_argument("--version", required=False, type=str, help="Game version")
subparsers = parser.add_subparsers(dest="command")

# "sort" takes no arguments of its own.
sort_parser = subparsers.add_parser(
    "sort", description="Sort all the symbols of a given GNU LD script by their offset"
)

# "cross" takes the two assembly files to compare, reference first.
cross_parser = subparsers.add_parser(
    "cross",
    description="Cross-reference the symbols between two assembly files and print the result to stdout for GNU LD. Useful to cross-reference symbols between different overlays or game revisions. The assemblies must be identical.",
)
cross_parser.add_argument(
    "ref",
    help="Assembly source file to use as a base reference",
)
cross_parser.add_argument(
    "to_cross",
    help="Assembly source file to be cross-referenced to",
)

# NOTE(review): this runs at import time, so importing the module from another
# program makes argparse read that program's command line.
args = parser.parse_args()
@ -20,22 +32,23 @@ if args.version == None:
args.version = "us"
# order a sequence of "name = 0xADDRESS;" lines by their address
def sort_symbols(syms):
    """Return the symbol lines of ``syms`` sorted by ascending offset.

    ``syms`` is any iterable of strings. The third whitespace-separated field
    of each line is read as a hexadecimal offset, with an optional trailing
    ``;``. Lines with fewer than three fields (blank lines, for example) are
    dropped. Lines are returned unmodified, and lines sharing the same offset
    keep their relative order.

    Raises ValueError if the third field of a line is not hexadecimal.
    """
    keyed_lines = []
    for line in syms:
        parts = line.split()
        if len(parts) >= 3:
            offset = int(parts[2].rstrip(";"), 16)
            keyed_lines.append((offset, line))
    # sort on the offset only; list.sort is stable so ties keep input order
    keyed_lines.sort(key=lambda entry: entry[0])
    return [line for _, line in keyed_lines]


# rewrite the same file with an ordered symbol list
def sort_symbols_from_file(symbol_file_name):
    """Sort the symbol list stored in ``symbol_file_name`` in place."""
    with open(symbol_file_name, "r") as symbol_file:
        sorted_lines = sort_symbols(symbol_file)
    with open(symbol_file_name, "w") as symbol_file:
        # The last line of the input may lack a newline; once sorted it can
        # land in the middle of the file, so every line is terminated here.
        symbol_file.writelines(
            line if line.endswith("\n") else line + "\n" for line in sorted_lines
        )


# previous name of sort_symbols_from_file, kept for existing callers
sort_symbol_list = sort_symbols_from_file
def sort(base_path):
@ -49,9 +62,183 @@ def sort(base_path):
]
for symbol_file_name in [os.path.join(base_path, f) for f in filtered_files]:
sort_symbol_list(symbol_file_name)
sort_symbols_from_file(symbol_file_name)
# regex helper: named group `name` capturing one or more upper-case
# hexadecimal digits (no '0x' prefix)
def re_hex(name):
    return "(?P<{}>[0-9A-F]+)".format(name)
# regex helper: matches the /* LOC VRAM VAL */ comment that splat's
# disassembler puts in front of an instruction, capturing the three
# hexadecimal fields under those names
def re_splat_line():
    fields = " ".join(re_hex(field) for field in ("LOC", "VRAM", "VAL"))
    return r"/\* " + fields + r" \*/"
# regex helper: named group `name` capturing a C-style identifier
# (a letter or underscore followed by letters, digits or underscores)
def re_ident(name):
    return "(?P<{}>[a-zA-Z_][a-zA-Z0-9_]*)".format(name)
# regex helper: named group `name` capturing an assembly register, i.e. a
# literal '$' followed by lower-case letters and/or digits ($v0, $at, $sp...)
def re_reg(name):
    # raw f-string: '\$' must reach the regex engine untouched; in a non-raw
    # literal it is an invalid escape sequence that Python warns about
    return rf"(?P<{name}>\$[0-9a-z]+)"
# regex helper to match the two %lo and %hi functions
re_func = r"(?P<FUNC>\%(hi|lo))"

# Building blocks shared by the patterns below. Everything is a raw string so
# that regex escapes such as '\(' and '\.' are not (invalid) Python escapes.
_RE_OP = rf"{re_splat_line()}\s+{re_ident('OP')}"  # /* ... */ op
_RE_DST = rf"{_RE_OP}\s+{re_reg('DST')}"  # ... op $dst
_RE_LEFT = rf"{_RE_DST},\s+{re_reg('LEFT')}"  # ... op $dst, $left
_RE_SYM = rf"{re_func}\({re_ident('SYM')}\)"  # %hi(sym) or %lo(sym)
_RE_SYM_IMM = rf"{_RE_SYM}\({re_reg('IMM')}\)"  # %lo(sym)($reg)
_RE_LABEL = rf"\.{re_ident('LABEL')}"  # .label

# All the regex patterns supported by the MIPS assembly parser, paired with
# the names of the groups each one captures. They are tried in order and the
# first match wins, so a longer form must come before any shorter form that
# would also match its beginning.
patterns = [
    (
        rf"{_RE_DST},\s+{_RE_SYM_IMM}",
        ["LOC", "VRAM", "VAL", "OP", "DST", "FUNC", "SYM", "IMM"],
    ),
    (
        rf"{_RE_DST},\s+{_RE_SYM}",
        ["LOC", "VRAM", "VAL", "OP", "DST", "FUNC", "SYM"],
    ),
    (
        rf"{_RE_LEFT},\s+{re_reg('RIGHT')}",
        ["LOC", "VRAM", "VAL", "OP", "DST", "LEFT", "RIGHT"],
    ),
    (
        rf"{_RE_LEFT},\s+{_RE_SYM_IMM}",
        ["LOC", "VRAM", "VAL", "OP", "DST", "LEFT", "FUNC", "SYM", "IMM"],
    ),
    (
        rf"{_RE_LEFT},\s+{_RE_SYM}",
        ["LOC", "VRAM", "VAL", "OP", "DST", "LEFT", "FUNC", "SYM"],
    ),
    (
        _RE_LEFT,
        ["LOC", "VRAM", "VAL", "OP", "DST", "LEFT"],
    ),
    (
        rf"{_RE_OP}\s+{_RE_LABEL}",
        ["LOC", "VRAM", "VAL", "OP", "LABEL"],
    ),
    (
        rf"{_RE_DST},\s+{_RE_LABEL}",
        ["LOC", "VRAM", "VAL", "OP", "DST", "LABEL"],
    ),
    (
        rf"{_RE_OP}$",
        ["LOC", "VRAM", "VAL", "OP"],
    ),
    (r"glabel (?P<FUNC_NAME>\w+)", ["FUNC_NAME"]),
]
# Tokenize a single line of MIPS assembly code.
# Returns a dict holding the groups captured by the first entry of `patterns`
# that matches the start of `line`, or None when no entry matches.
def asm_tokenize_line(line):
    for regex, group_names in patterns:
        found = re.match(regex, line)
        if found is None:
            continue
        return {group: found.group(group) for group in group_names}
    return None
# get a dictionary of all the non-matching and cross-referenced symbols
def get_non_matching_symbols(asm_ref, asm_cross):
    """Map the symbols used by ``asm_ref`` to the addresses used by ``asm_cross``.

    Both arguments are lists of assembly lines and are compared pairwise by
    index. When a pair differs by the symbol referenced through ``%hi`` or
    ``%lo``, the address is rebuilt from the raw instruction words (``VAL``)
    of ``asm_cross`` and stored under the symbol name found in ``asm_ref``.

    Returns a ``(status, symbols)`` tuple: ``("ok", {name: address})`` on
    success, or ``("fail", [])`` when the line counts differ, when only one
    line of a differing pair can be tokenized, when the opcodes differ, or
    when only the reference line carries a symbol.

    Raises AssertionError if ``%hi`` is used by an instruction other than
    ``lui``.
    """

    def is_value_equal(a, b, key):
        # Token values are always strings, so mapping "absent" to None keeps
        # "missing on both sides" equal and "missing on one side" different.
        return a.get(key) == b.get(key)

    def get_imm_addr(raw_val):
        # The 16-bit immediate is read from the first two bytes of VAL with
        # their order swapped (i.e. VAL is treated as little-endian).
        return int(raw_val[2:4] + raw_val[0:2], 16)

    def get_hi_addr(op, raw_val, line):
        if op != "lui":
            print(
                f"CROSS-REFERENCING ERROR: %hi did not expect op '{op}'.",
                file=sys.stderr,
            )
            print(f"affected line: {line}'.", file=sys.stderr)
            # raised explicitly: an `assert` statement disappears under -O
            raise AssertionError(f"%hi did not expect op '{op}'")
        return get_imm_addr(raw_val) << 16

    def get_lo_addr(raw_val):
        # %lo is a signed 16-bit quantity: 0x0000-0x7FFF is positive and
        # 0x8000-0xFFFF is negative.
        imm = get_imm_addr(raw_val)
        return imm if imm < 0x8000 else imm - 0x10000

    if len(asm_ref) != len(asm_cross):
        return "fail", []

    syms = {}
    prev_instr_hi = False
    cross_off = 0
    for line_ref, line_cross in zip(asm_ref, asm_cross):
        if line_ref == line_cross:
            continue  # if lines are identical, skip and continue

        tokens_ref = asm_tokenize_line(line_ref)
        tokens_cross = asm_tokenize_line(line_cross)
        if tokens_ref == tokens_cross:
            continue  # if tokens are identical, skip and continue
        if tokens_ref is None or tokens_cross is None:
            return "fail", []  # token mis-match, functions are different
        if not is_value_equal(tokens_ref, tokens_cross, "OP"):
            return "fail", []  # if op code is not the same, functions are different
        if is_value_equal(tokens_ref, tokens_cross, "SYM"):
            continue  # if a symbol is found and it is the same then continue
        if "SYM" not in tokens_ref:
            continue  # instruction do not use any symbol, skip and continue

        # If arriving here it should be the only case where cross-referencing
        # between two symbols should happen.
        sym = tokens_ref["SYM"]
        if sym.startswith("jpt_"):
            continue  # actively ignore jump tables
        if "SYM" not in tokens_cross:
            # only the reference line uses a symbol: there is no %hi/%lo to
            # read on the other side, so the functions are different
            return "fail", []

        func = tokens_cross["FUNC"]
        raw_val = tokens_cross["VAL"]
        if prev_instr_hi and func == "%lo":
            prev_instr_hi = False
            cross_off += get_lo_addr(raw_val)
            syms[sym] = cross_off
        elif func == "%hi":
            prev_instr_hi = True
            cross_off = get_hi_addr(tokens_cross["OP"], raw_val, line_cross)
        # Otherwise do nothing. There are instances where between a %hi and
        # a %lo some instructions can be found.
    return "ok", syms
# Cross-reference the symbols of two assembly files and print the outcome.
# On success every symbol is written to stdout as a GNU LD assignment; when
# the two files cannot be cross-referenced a message goes to stderr instead.
def cross(asm_reference_file_name, asm_to_cross_file_name):
    with open(asm_reference_file_name, "r") as ref_file, \
            open(asm_to_cross_file_name, "r") as cross_file:
        ref_lines = ref_file.readlines()
        cross_lines = cross_file.readlines()

    status, symbols = get_non_matching_symbols(ref_lines, cross_lines)
    if status != "ok":
        print(
            "assemblies too different to be cross-referenced automatically",
            file=sys.stderr,
        )
        return

    # print symbol list in GNU LD style
    for name, address in symbols.items():
        print(f"{name} = 0x{address:08X};")
if __name__ == "__main__":
    # run the sub-command selected on the command line; nothing happens when
    # no sub-command was given
    command = args.command
    if command == "cross":
        cross(args.ref, args.to_cross)
    elif command == "sort":
        sort("config/")

246
tools/symbols_test.py Normal file
View File

@ -0,0 +1,246 @@
#!/usr/bin/env python3
import os
import sys
import unittest
sys.path.append(os.getcwd())
from tools.symbols import asm_tokenize_line, get_non_matching_symbols, sort_symbols
class TestSortSymbols(unittest.TestCase):
    """Checks for sort_symbols."""

    def test_sort_symbol_list_based_on_their_offset(self):
        # input is deliberately out of order; a trailing comment must survive
        unordered = [
            "sym2 = 0x5678; // some comment",
            "sym3 = 0x9ABC;",
            "sym1 = 0x1234;",
        ]
        expected = [
            "sym1 = 0x1234;",
            "sym2 = 0x5678; // some comment",
            "sym3 = 0x9ABC;",
        ]
        self.assertEqual(sort_symbols(unordered), expected)
class TestTokenizeAssembly(unittest.TestCase):
    """Checks asm_tokenize_line against each supported instruction shape."""

    def _assert_tokens(self, asm, expected):
        # tokenize one line and compare the whole resulting dictionary
        self.assertEqual(asm_tokenize_line(asm), expected)

    def test_tokenize_instruction_with_no_parameters(self):
        self._assert_tokens(
            "/* 0 1 2 */ nop",
            {"LOC": "0", "VRAM": "1", "VAL": "2", "OP": "nop"},
        )

    def test_tokenize_instruction_with_left(self):
        self._assert_tokens(
            "/* 72DEC 80112DEC 28348424 */ addiu $a0, $a1",
            {
                "LOC": "72DEC",
                "VRAM": "80112DEC",
                "VAL": "28348424",
                "OP": "addiu",
                "DST": "$a0",
                "LEFT": "$a1",
            },
        )

    def test_tokenize_instruction_with_symbol(self):
        self._assert_tokens(
            "/* 72BF4 80112BF4 0780023C */ lui $v0, %hi(g_MyStruct_field)",
            {
                "LOC": "72BF4",
                "VRAM": "80112BF4",
                "VAL": "0780023C",
                "OP": "lui",
                "DST": "$v0",
                "FUNC": "%hi",
                "SYM": "g_MyStruct_field",
            },
        )

    def test_tokenize_instruction_with_symbol_offset(self):
        self._assert_tokens(
            "/* 72BB0 80112BB0 D0FFBD27 */ lhu $v0, %lo(g_MyStruct_field)($v0)",
            {
                "LOC": "72BB0",
                "VRAM": "80112BB0",
                "VAL": "D0FFBD27",
                "OP": "lhu",
                "DST": "$v0",
                "FUNC": "%lo",
                "SYM": "g_MyStruct_field",
                "IMM": "$v0",
            },
        )

    def test_tokenize_instruction_with_left_reg_and_right_sym(self):
        self._assert_tokens(
            "/* 72DEC 80112DEC 28348424 */ addiu $a0, $a1, %lo(PLAYER_animFrameIdx)",
            {
                "LOC": "72DEC",
                "VRAM": "80112DEC",
                "VAL": "28348424",
                "OP": "addiu",
                "DST": "$a0",
                "LEFT": "$a1",
                "FUNC": "%lo",
                "SYM": "PLAYER_animFrameIdx",
            },
        )

    def test_tokenize_instruction_with_left_reg_and_right_sym_with_imm(self):
        self._assert_tokens(
            "/* 72DEC 80112DEC 28348424 */ addiu $a0, $a1, %lo(PLAYER_animFrameIdx)($at)",
            {
                "LOC": "72DEC",
                "VRAM": "80112DEC",
                "VAL": "28348424",
                "OP": "addiu",
                "DST": "$a0",
                "LEFT": "$a1",
                "FUNC": "%lo",
                "SYM": "PLAYER_animFrameIdx",
                "IMM": "$at",
            },
        )

    def test_tokenize_instruction_with_left_and_right_syms(self):
        self._assert_tokens(
            "/* 72DEC 80112DEC 28348424 */ addiu $a0, $a1, $a2",
            {
                "LOC": "72DEC",
                "VRAM": "80112DEC",
                "VAL": "28348424",
                "OP": "addiu",
                "DST": "$a0",
                "LEFT": "$a1",
                "RIGHT": "$a2",
            },
        )

    def test_tokenize_instruction_with_label(self):
        self._assert_tokens(
            "/* 72DEC 80112DEC 28348424 */ jmp .MY_LABEL",
            {
                "LOC": "72DEC",
                "VRAM": "80112DEC",
                "VAL": "28348424",
                "OP": "jmp",
                "LABEL": "MY_LABEL",
            },
        )

    def test_tokenize_instruction_with_dst_and_label(self):
        self._assert_tokens(
            "/* 72DEC 80112DEC 28348424 */ bne $v0, .MY_LABEL",
            {
                "LOC": "72DEC",
                "VRAM": "80112DEC",
                "VAL": "28348424",
                "OP": "bne",
                "DST": "$v0",
                "LABEL": "MY_LABEL",
            },
        )
class TestCrossReferenceSymbols(unittest.TestCase):
    """Checks get_non_matching_symbols on hand-written assembly listings."""

    def test_get_non_matching_symbols_for_the_same_function(self):
        # same instruction sequence on both sides; only symbols, registers,
        # labels and addresses differ
        reference = [
            ".some asm_directive /* with some comments */",
            "",
            "glabel func_my_func_name_reference",
            "/* 72BB0 80112BB0 D0FFBD27 */ addiu $sp, $sp, -0x30",
            "/* 72BBC 80112BBC 58000234 */ ori $v0, $zero, 0xAAA",
            "/* 72BF4 80112BF4 0780023C */ lui $v0, %hi(g_MyStruct_field)",
            "/* 72BF8 80112BF8 642F4294 */ lhu $v0, %lo(g_MyStruct_field)($v0)",
            "/* 72C04 80112C04 09004014 */ bnez $v0, .L80112C2C",
            "/* 72DC8 80112DC8 1480023C */ lui $v0, %hi(D_80139824)",
            "/* 72BA0 80112BA0 21082200 */ addu $at, $at, $v0",
            "/* 72DCC 80112DCC 2498428C */ lhu $v0, %lo(D_80139824)($v0)",
            ".L80112DE8:",
            "/* 72DE8 80112DE8 0780043C */ lui $a0, %hi(PLAYER_animFrameIdx)",
            "/* 72DEC 80112DEC 28348424 */ addiu $a0, $a0, %lo(PLAYER_animFrameIdx)",
            "/* 73140 80113140 0800E003 */ jr $ra",
            "/* 73144 80113144 00000000 */ nop",
        ]
        to_cross = [
            ".some asm_directive /* with some comments */",
            "",
            "glabel func_my_func_name_cross",
            "/* 72BB0 80112BB0 D0FFBD27 */ addiu $sp, $sp, -0x30",
            "/* 72BBC 80112BBC 58000234 */ ori $v0, $zero, 0xBBB",
            "/* 728E4 801128E4 0780023C */ lui $v0, %hi(D_80012354)",
            "/* 728E8 801128E8 642F4294 */ lhu $v0, %lo(D_80012354)($v0)",
            "/* 72D1C 80112D1C 04004014 */ bnez $v0, .L80112D30",
            "/* 72B9C 80112B9C 0B80013C */ lui $at, %hi(D_800AD040)",
            "/* 72BA0 80112BA0 21082200 */ addu $at, $at, $v0",
            "/* 72BA4 80112BA4 40D02394 */ lhu $zero, %lo(D_800AD040)($at)",
            ".L80112AD8:",
            "/* 72AD8 80112AD8 0780043C */ lui $a0, %hi(PLAYER_animFrameIdx)",
            "/* 72ADC 80112ADC 28348424 */ addiu $a0, $a0, %lo(PLAYER_animFrameIdx)",
            "/* 73140 80113140 0800E003 */ jr $ra",
            "/* 73144 80113144 00000000 */ nop",
        ]

        outcome, found = get_non_matching_symbols(reference, to_cross)

        self.assertEqual(outcome, "ok")
        self.assertEqual(
            found,
            {"g_MyStruct_field": 0x80072F64, "D_80139824": 0x800AD040},
        )

    def test_fail_if_the_line_count_between_the_sources_is_different(self):
        outcome, _ = get_non_matching_symbols(["a", "b"], ["a", "b", "c"])
        self.assertEqual(outcome, "fail")

    def test_fail_if_the_sources_are_too_different_between_each_other(self):
        # both listings hold the same lines, but the branch and the symbol
        # load are swapped, so the instruction order differs
        prologue = [
            ".some asm_directive /* with some comments */",
            "",
            "glabel func_my_func_name",
            "/* 72BB0 80112BB0 D0FFBD27 */ addiu $sp, $sp, -0x30",
            "/* 72BBC 80112BBC 58000234 */ addiu $v0, $zero, 0xAAA",
        ]
        symbol_load = [
            "/* 72BF4 80112BF4 0780023C */ lui $v0, %hi(g_MyStruct_field)",
            "/* 72BF8 80112BF8 642F4294 */ lhu $v0, %lo(g_MyStruct_field)($v0)",
        ]
        branch = [
            "/* 72C04 80112C04 09004014 */ bnez $v0, .L80112C2C",
        ]
        epilogue = [
            "/* 73140 80113140 0800E003 */ jr $ra",
            "/* 73144 80113144 00000000 */ nop",
        ]
        reference = prologue + symbol_load + branch + epilogue
        to_cross = prologue + branch + symbol_load + epilogue

        outcome, _ = get_non_matching_symbols(reference, to_cross)
        self.assertEqual(outcome, "fail")
# when executed as a script, run every test case defined in this module
if __name__ == "__main__":
    unittest.main()

2
tools/tools.mk Normal file
View File

@ -0,0 +1,2 @@
# Run the unit tests of the Python tools. Declared phony so that a file or
# directory named "test" can never make this target look up to date.
.PHONY: test
test:
	python3 tools/symbols_test.py