From d17df87e1abc5f53fbee0b47f8b54146d50f2e12 Mon Sep 17 00:00:00 2001 From: Luciano Ciccariello Date: Mon, 31 Jul 2023 20:14:57 +0100 Subject: [PATCH] Add tool to automatically perform symbols cross referencing (#413) I have wanted to make this tool for a long time. Given that HD has recently become more dense in DRA, people who are willing to contribute to that overlay should not have the burden of manually cross-referencing symbols, as I have been doing for the past few weeks. Instead of writing a tutorial, I created a tool that cross-references symbols between functions. The two functions can come either from two different revisions of the game or from two different overlays. The latter is especially important for finally removing all the duplicate functions. I also took the opportunity to add some unit tests (finally!) in `tools/tests` instead of creating the dedicated folder `tools/symbols` with the tool and its tests in it. Let me know if `tools/tests` is a pattern we want to keep following for any future tool that needs to be tested. I also added an assembly parser written in Python. As of today it lives in `tools/symbols.py`. It is bare-bones, but it can parse the majority of assembly lines. I also added tests around it in case we want to expand it or extract the parser into its own utility. I've been thinking of ideally using it for a V2 of `find_duplicates.py` if we find the current tool to be limiting. ### Demo ``` sotn-decomp$ ./tools/symbols.py cross asm/us/dra/nonmatchings/71830/func_80112BB0.s asm/hd/dra/nonmatchings/71830/func_80112BB0.s D_80139824 = 0x801393F0; D_800ACF7C = 0x800AD040; D_800ACF7E = 0x800AD042; D_80138FC8 = 0x80138B94; D_80138FCA = 0x80138B96; ``` These symbols can be copied and pasted directly into `symbols.hd.dra.txt` or the relevant symbol list. 
--- Makefile | 2 + tools/symbols.py | 223 ++++++++++++++++++++++++++++++++++---- tools/symbols_test.py | 246 ++++++++++++++++++++++++++++++++++++++++++ tools/tools.mk | 2 + 4 files changed, 455 insertions(+), 18 deletions(-) create mode 100644 tools/symbols_test.py create mode 100644 tools/tools.mk diff --git a/Makefile b/Makefile index 3ab20c699..8ba1d62cc 100644 --- a/Makefile +++ b/Makefile @@ -591,6 +591,8 @@ $(BUILD_DIR)/$(ASSETS_DIR)/%.png.o: $(ASSETS_DIR)/%.png SHELL = /bin/bash -e -o pipefail +include tools/tools.mk + .PHONY: all, clean, format, check, expected .PHONY: main, dra, ric, cen, dre, mad, no3, np3, nz0, st0, wrp, rwrp, tt_000 .PHONY: %_dirs diff --git a/tools/symbols.py b/tools/symbols.py index 1aa5e333f..a7ac964e0 100755 --- a/tools/symbols.py +++ b/tools/symbols.py @@ -2,15 +2,27 @@ import argparse import os +import re +import sys -parser = argparse.ArgumentParser( - description="Manage game symbols with various operations" -) +parser = argparse.ArgumentParser(description="Perform operations on game symbols") parser.add_argument("--version", required=False, type=str, help="Game version") subparsers = parser.add_subparsers(dest="command") sort_parser = subparsers.add_parser( - "sort", description="Sort all the symbols by their offset" + "sort", description="Sort all the symbols of a given GNU LD script by their offset" +) +cross_parser = subparsers.add_parser( + "cross", + description="Cross-reference the symbols between two assembly files and print the result to stdout for GNU LD. Useful to cross-reference symbols between different overlays or game revisions. 
The assemblies must be identical.", +) +cross_parser.add_argument( + "ref", + help="Assembly source file to use as a base reference", +) +cross_parser.add_argument( + "to_cross", + help="Assembly source file to be cross-referenced to", ) args = parser.parse_args() @@ -20,22 +32,23 @@ if args.version == None: args.version = "us" -def sort_symbol_list(symbol_file_name): +def sort_symbols(syms): offsets = [] - with open(symbol_file_name) as symbol_file: - for line in symbol_file: - line = line.strip() - if not line: - continue - parts = line.split() - if len(parts) < 3: - continue - offset = parts[2].rstrip(";") - offsets.append((line, int(offset, 16))) + for line in syms: + parts = line.strip().split() + if len(parts) >= 3: + offset = int(parts[2].rstrip(";"), 16) + offsets.append((line, offset)) offsets.sort(key=lambda x: x[1]) + return [line[0] for line in offsets] + + +# rewrite the same file with an ordered symbol list +def sort_symbols_from_file(symbol_file_name): + with open(symbol_file_name, "r") as symbol_file: + sorted_lines = sort_symbols(symbol_file) with open(symbol_file_name, "w") as symbol_file: - for line, offset in offsets: - symbol_file.write(line + "\n") + symbol_file.writelines(sorted_lines) def sort(base_path): @@ -49,9 +62,183 @@ def sort(base_path): ] for symbol_file_name in [os.path.join(base_path, f) for f in filtered_files]: - sort_symbol_list(symbol_file_name) + sort_symbols_from_file(symbol_file_name) + + +# regex helper to match a hexadecimal string without the '0x' +def re_hex(name): + return f"(?P<{name}>[0-9A-F]+)" + + +# regex helper to parse splat's disassembler /* LOC VRAM VAL */ comments +def re_splat_line(): + return f"/\\* {re_hex('LOC')} {re_hex('VRAM')} {re_hex('VAL')} \\*/" + + +# regex helper to match C-style identifiers +def re_ident(name): + return f"(?P<{name}>[a-zA-Z_][a-zA-Z0-9_]*)" + + +# regex helper to match assembly registers +def re_reg(name): + return f"(?P<{name}>\$[0-9a-z]+)" + + +# regex helper to match the two 
%lo and %hi functions +re_func = r"(?P\%(hi|lo))" + + +# all the regex patterns supported by the MIPS assembly parser +patterns = [ + ( + f"{re_splat_line()}\\s+{re_ident('OP')}\\s+{re_reg('DST')},\\s+{re_func}\({re_ident('SYM')}\)\({re_reg('IMM')}\)", + ["LOC", "VRAM", "VAL", "OP", "DST", "FUNC", "SYM", "IMM"], + ), + ( + f"{re_splat_line()}\\s+{re_ident('OP')}\\s+{re_reg('DST')},\\s+{re_func}\({re_ident('SYM')}\)", + ["LOC", "VRAM", "VAL", "OP", "DST", "FUNC", "SYM"], + ), + ( + f"{re_splat_line()}\\s+{re_ident('OP')}\\s+{re_reg('DST')},\\s+{re_reg('LEFT')},\\s+{re_reg('RIGHT')}", + ["LOC", "VRAM", "VAL", "OP", "DST", "LEFT", "RIGHT"], + ), + ( + f"{re_splat_line()}\\s+{re_ident('OP')}\\s+{re_reg('DST')},\\s+{re_reg('LEFT')},\\s+{re_func}\({re_ident('SYM')}\)\({re_reg('IMM')}\)", + ["LOC", "VRAM", "VAL", "OP", "DST", "LEFT", "FUNC", "SYM", "IMM"], + ), + ( + f"{re_splat_line()}\\s+{re_ident('OP')}\\s+{re_reg('DST')},\\s+{re_reg('LEFT')},\\s+{re_func}\({re_ident('SYM')}\)", + ["LOC", "VRAM", "VAL", "OP", "DST", "LEFT", "FUNC", "SYM"], + ), + ( + f"{re_splat_line()}\\s+{re_ident('OP')}\\s+{re_reg('DST')},\\s+{re_reg('LEFT')}", + ["LOC", "VRAM", "VAL", "OP", "DST", "LEFT"], + ), + ( + f"{re_splat_line()}\\s+{re_ident('OP')}\\s+\.{re_ident('LABEL')}", + ["LOC", "VRAM", "VAL", "OP", "LABEL"], + ), + ( + f"{re_splat_line()}\\s+{re_ident('OP')}\\s+{re_reg('DST')},\\s+\.{re_ident('LABEL')}", + ["LOC", "VRAM", "VAL", "OP", "DST", "LABEL"], + ), + ( + f"{re_splat_line()}\\s+{re_ident('OP')}$", + ["LOC", "VRAM", "VAL", "OP"], + ), + (r"glabel (?P\w+)", ["FUNC_NAME"]), +] + + +# tokenize a single line of MIPS assembly code +def asm_tokenize_line(line): + for pattern, token_names in patterns: + match = re.match(pattern, line) + if match: + tokens = match.groupdict() + return {key: tokens[key] for key in token_names} + return None + + +# get a dictionary of all the non-matching and cross-referenced symbols +def get_non_matching_symbols(asm_ref, asm_cross): + def 
is_value_equal(a, b, key): + if key not in a and key not in b: + return True + if key not in a or key not in b: + return False + return a[key] == b[key] + + def get_imm_addr(raw_val): + return int(raw_val[2:4] + raw_val[0:2], 16) + + def get_hi_addr(op, raw_val): + if op == "lui": + return get_imm_addr(raw_val) << 16 + else: + print( + f"CROSS-REFERENCING ERROR: %hi did not expect op '{op}'.", + file=sys.stderr, + ) + print(f"affected line: {line_cross}'.", file=sys.stderr) + assert op == "lui" + + def get_lo_addr(op, raw_val): + imm = get_imm_addr(raw_val) + if imm < 32767: + return imm + return imm - 0x10000 + + ref_line_count = len(asm_ref) + cross_line_count = len(asm_cross) + if ref_line_count != cross_line_count: + return "fail", [] + + syms = dict() + prev_instr_hi = False + cross_off = 0 + for i in range(0, ref_line_count): + line_ref = asm_ref[i] + line_cross = asm_cross[i] + if line_ref == line_cross: + continue # if lines are identical, skip and continue + tokens_ref = asm_tokenize_line(line_ref) + tokens_cross = asm_tokenize_line(line_cross) + if tokens_ref == tokens_cross: + continue # if tokens are identical, skip and continue + if tokens_ref == None or tokens_cross == None: + return "fail", [] # token mis-match, functions are different + if is_value_equal(tokens_ref, tokens_cross, "OP") == False: + return "fail", [] # if op code is not the same, functions are different + if is_value_equal(tokens_ref, tokens_cross, "SYM") == True: + continue # if a symbol is found and it is the same then continue + if "SYM" not in tokens_ref: + continue # instruction do not use any symbol, skip and continue + + # If arriving here it should be the only case where cross-referencing + # between two symbols should happen. 
+ sym = tokens_ref["SYM"] + if sym.startswith("jpt_"): + continue # actively ignore jump tables + + op = tokens_cross["OP"] + func = tokens_cross["FUNC"] + raw_val = tokens_cross["VAL"] + if prev_instr_hi and func == "%lo": + prev_instr_hi = False + cross_off += get_lo_addr(op, raw_val) + syms[sym] = cross_off + elif func == "%hi": + prev_instr_hi = True + cross_off = get_hi_addr(op, raw_val) + else: + # Do nothing. There are instances where between a %hi and a %lo + # some instructions can be found. + continue + return "ok", syms + + +def cross(asm_reference_file_name, asm_to_cross_file_name): + with open(asm_reference_file_name, "r") as asm_ref_file: + with open(asm_to_cross_file_name, "r") as asm_cross_file: + err, syms = get_non_matching_symbols( + asm_ref_file.readlines(), asm_cross_file.readlines() + ) + if err != "ok": + print( + "assemblies too different to be cross-referenced automatically", + file=sys.stderr, + ) + return + + # print symbol list in GNU LD style + for sym in syms: + print(f"{sym} = 0x{syms[sym]:08X};") if __name__ == "__main__": if args.command == "sort": sort("config/") + elif args.command == "cross": + cross(args.ref, args.to_cross) diff --git a/tools/symbols_test.py b/tools/symbols_test.py new file mode 100644 index 000000000..42ae21cbc --- /dev/null +++ b/tools/symbols_test.py @@ -0,0 +1,246 @@ +#!/usr/bin/env python3 + +import os +import sys +import unittest + +sys.path.append(os.getcwd()) +from tools.symbols import asm_tokenize_line, get_non_matching_symbols, sort_symbols + + +class TestSortSymbols(unittest.TestCase): + def test_sort_symbol_list_based_on_their_offset(self): + sorted = sort_symbols( + ["sym2 = 0x5678; // some comment", "sym3 = 0x9ABC;", "sym1 = 0x1234;"] + ) + self.assertEqual( + sorted, + ["sym1 = 0x1234;", "sym2 = 0x5678; // some comment", "sym3 = 0x9ABC;"], + ) + + +class TestTokenizeAssembly(unittest.TestCase): + def test_tokenize_instruction_with_no_parameters(self): + asm = "/* 0 1 2 */ nop" + tokens = 
asm_tokenize_line(asm) + self.assertEqual( + tokens, + { + "LOC": "0", + "VRAM": "1", + "VAL": "2", + "OP": "nop", + }, + ) + + def test_tokenize_instruction_with_left(self): + asm = "/* 72DEC 80112DEC 28348424 */ addiu $a0, $a1" + tokens = asm_tokenize_line(asm) + self.assertEqual( + tokens, + { + "LOC": "72DEC", + "VRAM": "80112DEC", + "VAL": "28348424", + "OP": "addiu", + "DST": "$a0", + "LEFT": "$a1", + }, + ) + + def test_tokenize_instruction_with_symbol(self): + asm = "/* 72BF4 80112BF4 0780023C */ lui $v0, %hi(g_MyStruct_field)" + tokens = asm_tokenize_line(asm) + self.assertEqual( + tokens, + { + "LOC": "72BF4", + "VRAM": "80112BF4", + "VAL": "0780023C", + "OP": "lui", + "DST": "$v0", + "FUNC": "%hi", + "SYM": "g_MyStruct_field", + }, + ) + + def test_tokenize_instruction_with_symbol_offset(self): + asm = "/* 72BB0 80112BB0 D0FFBD27 */ lhu $v0, %lo(g_MyStruct_field)($v0)" + tokens = asm_tokenize_line(asm) + self.assertEqual( + tokens, + { + "LOC": "72BB0", + "VRAM": "80112BB0", + "VAL": "D0FFBD27", + "OP": "lhu", + "DST": "$v0", + "FUNC": "%lo", + "SYM": "g_MyStruct_field", + "IMM": "$v0", + }, + ) + + def test_tokenize_instruction_with_left_reg_and_right_sym(self): + asm = "/* 72DEC 80112DEC 28348424 */ addiu $a0, $a1, %lo(PLAYER_animFrameIdx)" + tokens = asm_tokenize_line(asm) + self.assertEqual( + tokens, + { + "LOC": "72DEC", + "VRAM": "80112DEC", + "VAL": "28348424", + "OP": "addiu", + "DST": "$a0", + "LEFT": "$a1", + "FUNC": "%lo", + "SYM": "PLAYER_animFrameIdx", + }, + ) + + def test_tokenize_instruction_with_left_reg_and_right_sym_with_imm(self): + asm = "/* 72DEC 80112DEC 28348424 */ addiu $a0, $a1, %lo(PLAYER_animFrameIdx)($at)" + tokens = asm_tokenize_line(asm) + self.assertEqual( + tokens, + { + "LOC": "72DEC", + "VRAM": "80112DEC", + "VAL": "28348424", + "OP": "addiu", + "DST": "$a0", + "LEFT": "$a1", + "FUNC": "%lo", + "SYM": "PLAYER_animFrameIdx", + "IMM": "$at", + }, + ) + + def test_tokenize_instruction_with_left_and_right_syms(self): + asm 
= "/* 72DEC 80112DEC 28348424 */ addiu $a0, $a1, $a2" + tokens = asm_tokenize_line(asm) + self.assertEqual( + tokens, + { + "LOC": "72DEC", + "VRAM": "80112DEC", + "VAL": "28348424", + "OP": "addiu", + "DST": "$a0", + "LEFT": "$a1", + "RIGHT": "$a2", + }, + ) + + def test_tokenize_instruction_with_label(self): + asm = "/* 72DEC 80112DEC 28348424 */ jmp .MY_LABEL" + tokens = asm_tokenize_line(asm) + self.assertEqual( + tokens, + { + "LOC": "72DEC", + "VRAM": "80112DEC", + "VAL": "28348424", + "OP": "jmp", + "LABEL": "MY_LABEL", + }, + ) + + def test_tokenize_instruction_with_dst_and_label(self): + asm = "/* 72DEC 80112DEC 28348424 */ bne $v0, .MY_LABEL" + tokens = asm_tokenize_line(asm) + self.assertEqual( + tokens, + { + "LOC": "72DEC", + "VRAM": "80112DEC", + "VAL": "28348424", + "OP": "bne", + "DST": "$v0", + "LABEL": "MY_LABEL", + }, + ) + + +class TestCrossReferenceSymbols(unittest.TestCase): + def test_get_non_matching_symbols_for_the_same_function(self): + asm_sample_reference = [ + ".some asm_directive /* with some comments */", + "", + "glabel func_my_func_name_reference", + "/* 72BB0 80112BB0 D0FFBD27 */ addiu $sp, $sp, -0x30", + "/* 72BBC 80112BBC 58000234 */ ori $v0, $zero, 0xAAA", + "/* 72BF4 80112BF4 0780023C */ lui $v0, %hi(g_MyStruct_field)", + "/* 72BF8 80112BF8 642F4294 */ lhu $v0, %lo(g_MyStruct_field)($v0)", + "/* 72C04 80112C04 09004014 */ bnez $v0, .L80112C2C", + "/* 72DC8 80112DC8 1480023C */ lui $v0, %hi(D_80139824)", + "/* 72BA0 80112BA0 21082200 */ addu $at, $at, $v0", + "/* 72DCC 80112DCC 2498428C */ lhu $v0, %lo(D_80139824)($v0)", + ".L80112DE8:", + "/* 72DE8 80112DE8 0780043C */ lui $a0, %hi(PLAYER_animFrameIdx)", + "/* 72DEC 80112DEC 28348424 */ addiu $a0, $a0, %lo(PLAYER_animFrameIdx)", + "/* 73140 80113140 0800E003 */ jr $ra", + "/* 73144 80113144 00000000 */ nop", + ] + asm_sample_to_cross = [ + ".some asm_directive /* with some comments */", + "", + "glabel func_my_func_name_cross", + "/* 72BB0 80112BB0 D0FFBD27 */ addiu $sp, $sp, 
-0x30", + "/* 72BBC 80112BBC 58000234 */ ori $v0, $zero, 0xBBB", + "/* 728E4 801128E4 0780023C */ lui $v0, %hi(D_80012354)", + "/* 728E8 801128E8 642F4294 */ lhu $v0, %lo(D_80012354)($v0)", + "/* 72D1C 80112D1C 04004014 */ bnez $v0, .L80112D30", + "/* 72B9C 80112B9C 0B80013C */ lui $at, %hi(D_800AD040)", + "/* 72BA0 80112BA0 21082200 */ addu $at, $at, $v0", + "/* 72BA4 80112BA4 40D02394 */ lhu $zero, %lo(D_800AD040)($at)", + ".L80112AD8:", + "/* 72AD8 80112AD8 0780043C */ lui $a0, %hi(PLAYER_animFrameIdx)", + "/* 72ADC 80112ADC 28348424 */ addiu $a0, $a0, %lo(PLAYER_animFrameIdx)", + "/* 73140 80113140 0800E003 */ jr $ra", + "/* 73144 80113144 00000000 */ nop", + ] + status, symbols = get_non_matching_symbols( + asm_sample_reference, asm_sample_to_cross + ) + self.assertEqual(status, "ok") + self.assertEqual( + symbols, + {"g_MyStruct_field": 0x80072F64, "D_80139824": 0x800AD040}, + ) + + def test_fail_if_the_line_count_between_the_sources_is_different(self): + status, _ = get_non_matching_symbols(["a", "b"], ["a", "b", "c"]) + self.assertEqual(status, "fail") + + def test_fail_if_the_sources_are_too_different_between_each_other(self): + asm_sample_reference = [ + ".some asm_directive /* with some comments */", + "", + "glabel func_my_func_name", + "/* 72BB0 80112BB0 D0FFBD27 */ addiu $sp, $sp, -0x30", + "/* 72BBC 80112BBC 58000234 */ addiu $v0, $zero, 0xAAA", + "/* 72BF4 80112BF4 0780023C */ lui $v0, %hi(g_MyStruct_field)", + "/* 72BF8 80112BF8 642F4294 */ lhu $v0, %lo(g_MyStruct_field)($v0)", + "/* 72C04 80112C04 09004014 */ bnez $v0, .L80112C2C", + "/* 73140 80113140 0800E003 */ jr $ra", + "/* 73144 80113144 00000000 */ nop", + ] + asm_sample_to_cross = [ + ".some asm_directive /* with some comments */", + "", + "glabel func_my_func_name", + "/* 72BB0 80112BB0 D0FFBD27 */ addiu $sp, $sp, -0x30", + "/* 72BBC 80112BBC 58000234 */ addiu $v0, $zero, 0xAAA", + "/* 72C04 80112C04 09004014 */ bnez $v0, .L80112C2C", + "/* 72BF4 80112BF4 0780023C */ lui $v0, 
%hi(g_MyStruct_field)", + "/* 72BF8 80112BF8 642F4294 */ lhu $v0, %lo(g_MyStruct_field)($v0)", + "/* 73140 80113140 0800E003 */ jr $ra", + "/* 73144 80113144 00000000 */ nop", + ] + status, _ = get_non_matching_symbols(asm_sample_reference, asm_sample_to_cross) + self.assertEqual(status, "fail") + + +if __name__ == "__main__": + unittest.main() diff --git a/tools/tools.mk b/tools/tools.mk new file mode 100644 index 000000000..a62c7ce5b --- /dev/null +++ b/tools/tools.mk @@ -0,0 +1,2 @@ +test: + python3 tools/symbols_test.py \ No newline at end of file