2023-05-06 01:09:18 +00:00
#!/usr/bin/env python3
import argparse
2023-09-24 16:55:33 +00:00
import mapfile_parser
2023-05-06 01:09:18 +00:00
import os
2023-09-24 16:55:33 +00:00
from pathlib import Path
2023-07-31 19:14:57 +00:00
import re
import sys
2023-09-16 09:40:40 +00:00
import yaml
Extract and document all functions from main.exe (#736)
This took me a while. I used a [throw-away
script](https://gist.github.com/Xeeynamo/58da1ff8f3831d0ba5d23da27cbca025)
to help me with the quest, but I still had to manually check every
single YAML subsegment.
I transported over what we were able to decompile from main. For a few
functions I added the signatures and documented existing DRA code,
especially on the sound department (got inspired by the recent @sozud
PRs).
The whole `.text` section of `main.exe` is now completely extracted as
C/ASM where appropriate. All the functions have their original PSY-Q
names except for 7 of them: `func_80012DBC`, `func_80012F84`,
`func_80017008`, `func_80017078`, `func_8001929C`, `func_80021F0C` and
`func_800286E0`. I did not feel confident enough to rename them, so I
left them as they are. I am 99% sure the names of all the other
functions are accurate.
I am now excluding the
[asm/](https://github.com/Xeeynamo/sotn-decomp/tree/master/asm/us/main/psxsdk)
folder from the repo. It was useless.
This research confirms that the game uses the PSY-Q 3.5 libraries, with
the exception of `LIBGPU.LIB`, which is from PSY-Q 3.0 for some unknown
reason.
EDIT: `make format` was not taking care of duplicated symbols in our
symbol list. To speed up my work, all the duplicate symbols (a duplicate
is an entry with the same name AND offset) are now removed.
2023-11-06 22:53:27 +00:00
from collections import OrderedDict
2023-05-06 01:09:18 +00:00
2023-07-31 19:14:57 +00:00
# Command line interface: one optional global `--version` flag plus one
# sub-command per operation. The selected sub-command name ends up in the
# `command` attribute of the parsed namespace (None when none is given).
parser = argparse.ArgumentParser(description="Perform operations on game symbols")
parser.add_argument("--version", required=False, type=str, help="Game version")
subparsers = parser.add_subparsers(dest="command")

# `sort` takes no arguments of its own
sort_parser = subparsers.add_parser(
    "sort", description="Sort all the symbols of a given GNU LD script by their offset"
)

# `cross` takes the two assembly files to compare
cross_parser = subparsers.add_parser(
    "cross",
    description="Cross-reference the symbols between two assembly files and print the result to stdout for GNU LD. Useful to cross-reference symbols between different overlays or game revisions. The assemblies must be identical.",
)
cross_parser.add_argument(
    "ref",
    help="Assembly source file to use as a base reference",
)
cross_parser.add_argument(
    "to_cross",
    help="Assembly source file to be cross-referenced to",
)

# `remove-orphans` takes the Splat YAML config describing the overlay
orphan_parser = subparsers.add_parser(
    "remove-orphans",
    description="Remove all symbols that are not referenced from a specific group of assembly code",
)
orphan_parser.add_argument(
    "config_yaml",
    help="The Splat YAML config of the overlay to remove the orphan symbols from",
)

# `map` takes a map file and an optional switch to hide default symbol names
map_parser = subparsers.add_parser(
    "map",
    description="Print the list of symbols from a map file",
)
map_parser.add_argument(
    "map_file_name",
    help="The map file to extract the symbols from",
)
map_parser.add_argument(
    "--no-default",
    required=False,
    action="store_true",
    help="Do not include Splat default symbols that starts with D_ or func_",
)
2023-05-06 01:09:18 +00:00
# Parse the command line once at import time; the functions below read the
# module-level `args`.
args = parser.parse_args()
if args.version is None:
    # no --version given: fall back to the VERSION environment variable and,
    # when that is not set either, to "us"
    args.version = os.getenv("VERSION", "us")
2023-05-06 01:09:18 +00:00
2023-09-24 16:55:33 +00:00
def is_splat_symbol_name(name):
    """Return True when `name` starts with `D_`, `func_`, `jpt_` or `jtbl_`.

    These are the prefixes this script treats as Splat default
    (auto-generated) symbol names.
    """
    return name.startswith(("D_", "func_", "jpt_", "jtbl_"))
2023-09-16 09:51:21 +00:00
def add_newline_if_missing(list):
    """Make sure the last string of `list` ends with a newline.

    The list is modified in place and also returned for convenience. An empty
    list is returned untouched.
    """
    # NOTE: the parameter shadows the `list` builtin; the name is kept so the
    # function's signature does not change.
    if list and not list[-1].endswith("\n"):
        list[-1] += "\n"
    return list
2023-07-31 19:14:57 +00:00
def sort_symbols(syms):
    """Return the symbol lines of `syms` sorted by offset, without duplicates.

    `syms` is an iterable of lines in the form `name = 0xOFFSET; ...`. Lines
    starting with `//` and lines with fewer than three whitespace-separated
    fields are dropped. Lines sharing the same offset keep their relative
    order, and only the first occurrence of identical lines is kept.

    Raises ValueError when the third field of a line is not a hexadecimal
    number.
    """
    entries = []
    for line in syms:
        if line.startswith("//"):
            # ignore comments
            continue
        parts = line.split()
        if len(parts) >= 3:
            # An unterminated line (typically the last one of a file) must be
            # normalized before sorting: once moved to the middle it would be
            # glued to the next symbol, and it would not compare equal to an
            # otherwise identical, newline-terminated duplicate.
            if not line.endswith("\n"):
                line += "\n"
            entries.append((line, int(parts[2].rstrip(";"), 16)))
    # list.sort is stable, so lines with the same offset keep their order
    entries.sort(key=lambda entry: entry[1])
    return list(OrderedDict.fromkeys(line for line, _ in entries))
2023-07-31 19:14:57 +00:00
# rewrite the same file with an ordered symbol list
def sort_symbols_from_file(symbol_file_name):
    """Sort the symbol file `symbol_file_name` in place.

    The file is read through `sort_symbols` and then overwritten with the
    result, whose last line is guaranteed to end with a newline.
    """
    with open(symbol_file_name, "r") as symbol_file:
        sorted_lines = sort_symbols(symbol_file)
    add_newline_if_missing(sorted_lines)
    # the file is only reopened for writing once it has been read completely
    with open(symbol_file_name, "w") as symbol_file:
        symbol_file.writelines(sorted_lines)
2023-07-25 17:38:30 +00:00
2023-05-06 01:09:18 +00:00
def sort(base_path):
    """Sort in place every symbol file of the selected game version.

    Only the direct children of `base_path` named
    `symbols.<args.version>.*.txt` are processed.
    """
    prefix = f"symbols.{args.version}."
    # os.listdir returns a complete list up front, so rewriting the files
    # while looping does not affect the iteration
    for name in os.listdir(base_path):
        if name.startswith(prefix) and name.endswith(".txt"):
            sort_symbols_from_file(os.path.join(base_path, name))
# regex helper to match a hexadecimal string without the '0x'
def re_hex(name):
    """Return a regex group named `name` matching upper-case hex digits."""
    return f"(?P<{name}>[0-9A-F]+)"


# regex helper to parse splat's disassembler /* LOC VRAM VAL */ comments
def re_splat_line():
    """Return a regex matching a `/* LOC VRAM VAL */` comment.

    The three hexadecimal fields are captured as the LOC, VRAM and VAL groups.
    """
    return rf"/\* {re_hex('LOC')} {re_hex('VRAM')} {re_hex('VAL')} \*/"


# regex helper to match C-style identifiers
def re_ident(name):
    """Return a regex group named `name` matching a C-style identifier."""
    return f"(?P<{name}>[a-zA-Z_][a-zA-Z0-9_]*)"


# regex helper to match assembly registers
def re_reg(name):
    """Return a regex group named `name` matching a `$`-prefixed register."""
    return rf"(?P<{name}>\$[0-9a-z]+)"


# regex helper to match the two %lo and %hi functions
re_func = r"(?P<FUNC>\%(hi|lo))"

# All the regex patterns supported by the MIPS assembly parser, paired with
# the names of the groups each one captures. They are tried in order and the
# first match wins, so the more specific operand layouts come first.
#
# These are raw strings: whitespace must be spelled `\s+`. Writing `\\s+` here
# would match a literal backslash followed by 's' characters, which never
# occurs in an instruction line.
patterns = [
    (
        rf"{re_splat_line()}\s+{re_ident('OP')}\s+{re_reg('DST')},\s+{re_func}\({re_ident('SYM')}\)\({re_reg('IMM')}\)",
        ["LOC", "VRAM", "VAL", "OP", "DST", "FUNC", "SYM", "IMM"],
    ),
    (
        rf"{re_splat_line()}\s+{re_ident('OP')}\s+{re_reg('DST')},\s+{re_func}\({re_ident('SYM')}\)",
        ["LOC", "VRAM", "VAL", "OP", "DST", "FUNC", "SYM"],
    ),
    (
        rf"{re_splat_line()}\s+{re_ident('OP')}\s+{re_reg('DST')},\s+{re_reg('LEFT')},\s+{re_reg('RIGHT')}",
        ["LOC", "VRAM", "VAL", "OP", "DST", "LEFT", "RIGHT"],
    ),
    (
        rf"{re_splat_line()}\s+{re_ident('OP')}\s+{re_reg('DST')},\s+{re_reg('LEFT')},\s+{re_func}\({re_ident('SYM')}\)\({re_reg('IMM')}\)",
        ["LOC", "VRAM", "VAL", "OP", "DST", "LEFT", "FUNC", "SYM", "IMM"],
    ),
    (
        rf"{re_splat_line()}\s+{re_ident('OP')}\s+{re_reg('DST')},\s+{re_reg('LEFT')},\s+{re_func}\({re_ident('SYM')}\)",
        ["LOC", "VRAM", "VAL", "OP", "DST", "LEFT", "FUNC", "SYM"],
    ),
    (
        rf"{re_splat_line()}\s+{re_ident('OP')}\s+{re_reg('DST')},\s+{re_reg('LEFT')}",
        ["LOC", "VRAM", "VAL", "OP", "DST", "LEFT"],
    ),
    (
        rf"{re_splat_line()}\s+{re_ident('OP')}\s+\.{re_ident('LABEL')}",
        ["LOC", "VRAM", "VAL", "OP", "LABEL"],
    ),
    (
        rf"{re_splat_line()}\s+{re_ident('OP')}\s+{re_reg('DST')},\s+\.{re_ident('LABEL')}",
        ["LOC", "VRAM", "VAL", "OP", "DST", "LABEL"],
    ),
    (
        rf"{re_splat_line()}\s+{re_ident('OP')}$",
        ["LOC", "VRAM", "VAL", "OP"],
    ),
    (r"glabel (?P<FUNC_NAME>\w+)", ["FUNC_NAME"]),
]


# tokenize a single line of MIPS assembly code
def asm_tokenize_line(line):
    """Split an assembly line into its named tokens.

    Returns a dict holding the groups of the first entry of `patterns` that
    matches the beginning of `line`, or None when no pattern matches.
    """
    for pattern, token_names in patterns:
        match = re.match(pattern, line)
        if match:
            return {key: match.group(key) for key in token_names}
    return None
# get a dictionary of all the non-matching and cross-referenced symbols
def get_non_matching_symbols(asm_ref, asm_cross):
    """Cross-reference the symbols of two assembly listings, line by line.

    `asm_ref` and `asm_cross` are lists of assembly lines. Wherever the two
    listings use a different symbol in the same instruction, the address of
    the symbol named in `asm_ref` is rebuilt from the %hi/%lo instruction
    pair found in `asm_cross`.

    Returns `("ok", syms)`, where `syms` maps each symbol name to its address,
    or `("fail", [])` when the listings differ in length, in opcode, or in
    whether a line can be tokenized.

    Raises AssertionError when a %hi is used by an instruction other than
    `lui`.
    """

    def is_value_equal(a, b, key):
        # the token sets agree on `key` when both lack it or both hold the
        # same value for it
        if key not in a and key not in b:
            return True
        if key not in a or key not in b:
            return False
        return a[key] == b[key]

    def get_imm_addr(raw_val):
        # swap the first two hex byte pairs of VAL to read a 16-bit value
        # (presumably the little-endian immediate of the instruction)
        return int(raw_val[2:4] + raw_val[0:2], 16)

    def get_hi_addr(op, raw_val, line):
        if op != "lui":
            print(
                f"CROSS-REFERENCING ERROR: %hi did not expect op '{op}'.",
                file=sys.stderr,
            )
            print(f"affected line: '{line}'.", file=sys.stderr)
            # raised explicitly so the check also holds under `python -O`,
            # which strips plain assert statements
            raise AssertionError(f"%hi did not expect op '{op}'")
        return get_imm_addr(raw_val) << 16

    def get_lo_addr(op, raw_val):
        # %lo is a signed 16-bit value: 0x0000-0x7FFF are positive (0x7FFF
        # included), 0x8000-0xFFFF are negative
        imm = get_imm_addr(raw_val)
        if imm <= 0x7FFF:
            return imm
        return imm - 0x10000

    if len(asm_ref) != len(asm_cross):
        return "fail", []

    syms = dict()
    prev_instr_hi = False
    cross_off = 0
    for line_ref, line_cross in zip(asm_ref, asm_cross):
        if line_ref == line_cross:
            continue  # if lines are identical, skip and continue

        tokens_ref = asm_tokenize_line(line_ref)
        tokens_cross = asm_tokenize_line(line_cross)
        if tokens_ref == tokens_cross:
            continue  # if tokens are identical, skip and continue
        if tokens_ref is None or tokens_cross is None:
            return "fail", []  # token mis-match, functions are different
        if not is_value_equal(tokens_ref, tokens_cross, "OP"):
            return "fail", []  # if op code is not the same, functions are different
        if is_value_equal(tokens_ref, tokens_cross, "SYM"):
            continue  # if a symbol is found and it is the same then continue
        if "SYM" not in tokens_ref:
            continue  # instruction do not use any symbol, skip and continue

        # If arriving here it should be the only case where cross-referencing
        # between two symbols should happen.
        sym = tokens_ref["SYM"]
        if sym.startswith("jpt_"):
            continue  # actively ignore jump tables

        op = tokens_cross["OP"]
        func = tokens_cross["FUNC"]
        raw_val = tokens_cross["VAL"]
        if prev_instr_hi and func == "%lo":
            # second half of the pair: the address is now complete
            prev_instr_hi = False
            cross_off += get_lo_addr(op, raw_val)
            syms[sym] = cross_off
        elif func == "%hi":
            # first half of the pair: remember the upper 16 bits
            prev_instr_hi = True
            cross_off = get_hi_addr(op, raw_val, line_cross)
        else:
            # Do nothing. There are instances where between a %hi and a %lo
            # some instructions can be found.
            continue
    return "ok", syms
def cross(asm_reference_file_name, asm_to_cross_file_name):
    """Print the symbols of the reference file as located in the other file.

    Both assembly files are read completely and handed to
    `get_non_matching_symbols`. The result is printed to stdout in GNU LD
    style (`name = 0xADDRESS;`). When the two files cannot be cross-referenced
    a message is printed to stderr and nothing is written to stdout.
    """
    with open(asm_reference_file_name, "r") as asm_ref_file:
        ref_lines = asm_ref_file.readlines()
    with open(asm_to_cross_file_name, "r") as asm_cross_file:
        cross_lines = asm_cross_file.readlines()

    err, syms = get_non_matching_symbols(ref_lines, cross_lines)
    if err != "ok":
        print(
            "assemblies too different to be cross-referenced automatically",
            file=sys.stderr,
        )
        return

    # print symbol list in GNU LD style
    for sym, addr in syms.items():
        print(f"{sym} = 0x{addr:08X};")
2023-05-06 01:09:18 +00:00
2023-07-25 17:38:30 +00:00
2023-09-16 09:40:40 +00:00
def get_all_file_paths_recursively(path):
    """Return the path of every file found under `path`, at any depth.

    Each entry is the directory reported by `os.walk` joined with the file
    name. The list is empty when `path` does not exist or holds no files.
    """
    return [
        os.path.join(root, file)
        for root, _directories, files in os.walk(path)
        for file in files
    ]
def tokenize_symbols(file_path):
    """Return every identifier-like word found in the file `file_path`.

    Before the words are collected, `/* ... */` comments (even when spanning
    several lines), words starting with `L8` and double-quoted strings are
    blanked out. Words are returned in order of appearance, duplicates
    included.
    """
    with open(file_path, "r") as f:
        content = f.read()
    # Every removed span is replaced by a single space so that the text on
    # either side of it is never fused into one word.
    content_without_comments = re.sub(r"/\*.*?\*/", " ", content, flags=re.DOTALL)
    content_without_labels = re.sub(r"\bL8\w*", " ", content_without_comments)
    content_without_strings = re.sub(r'"[^"]*"', " ", content_without_labels)
    return re.findall(r"\b[a-zA-Z_]\w*\b", content_without_strings)
def remove_orphans(symbol_file_name, symbols_set):
    """Rewrite a symbol file keeping only the symbols found in `symbols_set`.

    A `name = value` line is dropped when `name` is not in `symbols_set`.
    Lines starting with `//` are dropped as well. Lines of four characters or
    fewer, lines containing `ignore:true` and lines without an `=` are always
    kept. The file is overwritten in place and its last line is guaranteed to
    end with a newline.
    """
    with open(symbol_file_name, "r") as symbol_file_ref:
        symbols_defined = symbol_file_ref.readlines()

    symbols_unorphaned = []
    for sym_def in symbols_defined:
        if sym_def.startswith("//"):
            # ignore comments
            continue
        if len(sym_def) > 4 and "ignore:true" not in sym_def:
            # `separator` is empty when the line holds no '=' at all
            name, separator, _ = sym_def.partition("=")
            if separator and name.strip() not in symbols_set:
                continue  # orphan: nothing references this symbol
        symbols_unorphaned.append(sym_def)

    add_newline_if_missing(symbols_unorphaned)
    with open(symbol_file_name, "w") as symbol_file_ref:
        symbol_file_ref.writelines(symbols_unorphaned)
def remove_orphans_from_config(config_yaml):
    """Remove the orphan symbols of the overlay described by a Splat config.

    The symbol file is taken from `options.symbol_addrs_path` (the last entry
    when it is a list) and the assembly from every `.s` file found under
    `options.asm_path`. When no `.s` file is found the process exits with
    status 0 and the symbol file is left untouched.
    """
    with open(config_yaml, "r") as config_yaml_ref:
        config = yaml.safe_load(config_yaml_ref)

    symbol_addrs_path = config["options"]["symbol_addrs_path"]
    if isinstance(symbol_addrs_path, str):
        symbol_file_name = symbol_addrs_path
    else:
        symbol_file_name = symbol_addrs_path[-1]  # take last

    asm_path = config["options"]["asm_path"]
    file_list = get_all_file_paths_recursively(asm_path)
    asm_file_list = [file for file in file_list if file.endswith(".s")]
    # The guard must look at the assembly files only: a folder holding just
    # other kinds of files yields no symbols either, and going on would wipe
    # nearly the whole symbol list.
    if not asm_file_list:
        print(
            f"WARN: No symbols found for '{symbol_file_name}' in '{asm_path}'. Terminating before making destructive changes.",
            file=sys.stderr,
        )
        sys.exit(0)

    symbols_found = set()
    for asm_file in asm_file_list:
        symbols_found.update(tokenize_symbols(asm_file))

    # The following hack forces to also process symbols from the YAML config itself.
    # This is because tiledef in ST/WRP uses the symbol list to extract the tile definition.
    symbols_found.update(tokenize_symbols(config_yaml))

    remove_orphans(symbol_file_name, symbols_found)
2023-09-24 16:55:33 +00:00
def print_map_symbols(map_file_name, no_default):
    """Print the symbols of a map file to stdout, one per line.

    Each line has the form `name = 0xVRAM; // allow_duplicated:True`. Only the
    first symbol met at each address is printed. When `no_default` is true,
    names accepted by `is_splat_symbol_name` are skipped.
    """
    map_file = mapfile_parser.MapFile()
    map_file.readMapFile(Path(map_file_name))

    syms = dict()
    for segment in map_file:
        for file in segment:
            for sym in file:
                if sym.vram in syms:
                    continue  # an earlier symbol already owns this address
                if no_default and is_splat_symbol_name(sym.name):
                    continue
                syms[sym.vram] = sym.name

    for vram, name in syms.items():
        print(f"{name} = 0x{vram:08X}; // allow_duplicated:True")
2023-09-24 16:55:33 +00:00
2023-05-06 01:09:18 +00:00
if __name__ == "__main__":
    # dispatch on the sub-command selected on the command line; nothing is
    # done when no sub-command was given
    if args.command == "sort":
        sort("config/")
    elif args.command == "cross":
        cross(args.ref, args.to_cross)
    elif args.command == "remove-orphans":
        remove_orphans_from_config(args.config_yaml)
    elif args.command == "map":
        print_map_symbols(args.map_file_name, args.no_default)