diff --git a/.github/workflows/validate-and-report.yml b/.github/workflows/validate-and-report.yml index 04ddb21c4..8a9fb8b7f 100644 --- a/.github/workflows/validate-and-report.yml +++ b/.github/workflows/validate-and-report.yml @@ -72,7 +72,7 @@ jobs: run: make check - name: Analyze calls dry run if: matrix.version == 'us' - run: make force_extract && ./tools/analyze_calls.py ALL_DRY + run: make force_extract && ./tools/analyze_calls.py --ultradry - name: Remove clutter from build folder run: rm -rf build/$(VERSION)/asm build/$(VERSION)/src build/$(VERSION)/assets - name: Export build folder @@ -181,7 +181,7 @@ jobs: - name: Generate function calls chart run: | make force_extract - python3 tools/analyze_calls.py ALL + python3 tools/analyze_calls.py git clean -fdx asm/ - name: Generate duplicates and function report run: | diff --git a/tools/analyze_calls.py b/tools/analyze_calls.py index 88fb7465e..a94d64ea7 100755 --- a/tools/analyze_calls.py +++ b/tools/analyze_calls.py @@ -1,40 +1,53 @@ #!/usr/bin/python3 -# Very closely related to analyze_calls.py (and uses some of the same internal Python functions). -# USAGE: Invoke with no arguments to start interactive graphical mode. User will be prompted for -# functions one at a time, and a graph will be made for each one they give. -# Invoke with one argument, a function name, for command line mode. Callers and callees will be printed. -# Invoke with one argument, ALL, to automatically generate call trees for every function in the game, -# as well as an index.html to connect them all. This is meant to be hosted on the web. +# Performs analysis of every function in the asm directory. For each function, finds +# all functions called by this one, and all functions that call this one. +# By default, this will create several thousand SVG files. +# Run with --dry to generate data for all files as testrun, without outputting files. +# Run with --ultradry to just do the function analysis, but skip generating SVG. -# Credit to xeeynamo for creation of analyze_calls, and sonicdcer for suggestion of command line mode. +# This will print the time taken for each step of the process, which is helpful for +# benchmarking, as well as determining if execution is taking longer as the repo grows. +# On my machine, all steps prior to SVG generation take less than 1 second, and SVG +# generation takes 30 seconds (regardless of dry mode). Note times printed are cumulative +# in the sense that the printed time is not per-step, but is time since execution start. -# On first run, this will generate a call tree, traversing every function and identifying its calls. -# This gets saved to sotn_calltree.txt. To regenerate this tree, simply delete the file. - -# all just for drawing the graph -from functools import partial -import multiprocessing -import graphviz -from PIL import Image -import io - -# regex matching, file access -import re -import os from pathlib import Path -import sys +import os +import re +import graphviz +import time +import multiprocessing +import argparse + +parser = argparse.ArgumentParser(description="Generate call diagram SVGs") +parser.add_argument( + "--dry", + action="store_true", + help="Perform a dry run. Generate SVGs, but do not output to file", +) +parser.add_argument( + "--ultradry", + action="store_true", + help="Perform a dry run. Calculate call hierarchy, but skip generating SVGs", +) output_dir = "function_calls" +# All functions I've found that are used in a 'jalr' instruction in the game callable_registers = ["$v0", "$v1", "$a0", "$a1", "$t2"] +# Handles drawing in the blobs +graph_colors = {"N/A": "lightblue", "True": "green", "False": "red"} -def handle_jal_call(full_file, call_index, known_func_list): +# Important function. For a given line of assembly including a 'jal' or 'jalr' instruction, +# Make an attempt to identify the function being called. For 'jal' this is usually easy. +# For 'jalr' it tends to require more processing. +def handle_jal_call(full_file, call_index): call_line = full_file[call_index] call_target = call_line.split(" ")[-1].strip() - if call_target in known_func_list: # easy, just a direct function call by name + if call_target in function_lookup: # easy, just a direct function call by name return call_target - # Check if we're calling based on a register's value + # Check if we're calling based on a register's value. if call_target in callable_registers: # Find what line sets that register. searchback = 1 @@ -48,17 +61,25 @@ def handle_jal_call(full_file, call_index, known_func_list): if "g_api" in callreg_setline: # regex that will pull out the function being called match = re.search(r"%lo\(([^)]+)\)", callreg_setline) - if match.group(1) in known_func_list: + # Handle members of the g_api Overlay object + if "g_api_o_" in match.group(1): + return match.group(1).replace("_", ".") + # Handle the first element of it, where the symbol gets truncated to just g_api + if "g_api" == match.group(1): + return "g_api.o.Update" + # Handle functions such as AllocPrimitives that are referenced indirectly through g_api + if match.group(1).replace("g_api_", "") in function_lookup: + return match.group(1).replace("g_api_", "") + # Handle remaining symbols that are just straight in GameApi + if match.group(1) in function_lookup: return match.group(1) - if match.group(1) == "g_api": - return "g_api_o_Update" if ( f"lw {call_target}, %lo(D_" in callreg_setline ): # Simply jumping to what's stored in a D_ variable jump_variable = callreg_setline[51:61] return jump_variable if "0x28($s0)" in callreg_setline or "-0xC($s0)" in callreg_setline: - return "pfnEntityUpdate" + return "UnknownpfnEntityUpdate" # happens in NZ0/func_801C1034. v0 is set by dereferencing a register. target_setter_pattern = r"lw\s+" + "\\" + call_target + r", 0x.{,2}\((\$\w+)" if match := re.search(target_setter_pattern, callreg_setline): @@ -101,47 +122,7 @@ def handle_jal_call(full_file, call_index, known_func_list): exit(1) -def get_g_api_funcs(): - funclist = [] - curr_struct_lines = [] - with open("include/game.h") as f: - lines = f.readlines() - for line in lines: - if "typedef struct" in line: - curr_struct_lines = [line] - elif len( - curr_struct_lines - ): # We're in a struct and actively have added lines to it. This one is next. - curr_struct_lines.append(line) - if line[0] == "}" and len( - curr_struct_lines - ): # We're at the end of the struct and it's all loaded into curr_struct_lines. - structname = line.split(";")[0][2:] # Grab the name - # If this is the struct we're looking for, process it. Otherwise forget the struct. - if structname == "GameApi": - for line in curr_struct_lines: - match = re.search( - r"^\s*\/\* ([a-fA-F0-9]+) \*\/\s+[^\s]+\s+\(?\*?([^\s\;\)]+)", - line, - ) - if match: - function_name = match.group(2) - funclist.append("g_api_" + function_name) - if structname == "Overlay": - for line in curr_struct_lines: - match = re.search( - r"^\s*\/\* ([a-fA-F0-9]+) \*\/\s+[^\s]+\s+\(?\*?([^\s\;\)]+)", - line, - ) - if match: - function_name = match.group(2) - funclist.append("g_api_o_" + function_name) - curr_struct_lines = ( - [] - ) # Empty it so we go back to just piping lines to output file - return funclist - - +# SDK functions in include/psxsdk/{whatever}.h are not in the decomp, but may be called. def get_sdk_funcs(): functions = [] # weird thing where setjmp is commented out in libc.h; we add it manually here for now. @@ -158,6 +139,7 @@ def get_sdk_funcs(): return functions +# Many functions in main are not being splatted out yet, so we add them here, like SDK. def get_main_funcs(): with open(f"config/symbols.us.txt") as f: symbols = f.readlines() @@ -171,113 +153,20 @@ def get_main_funcs(): return [line.split(" = ")[0] for line in symbols] -def get_all_funcnames(): - api = get_g_api_funcs() - sdk = get_sdk_funcs() - cfuncs = [s.stem for s in Path("asm").rglob("*.s") if "nonmatchings" in str(s)] - # files in src/main are from sdk and remain as asm, not broken out into function files - mainfuncs = get_main_funcs() - return api + sdk + cfuncs + mainfuncs - - -def build_call_tree(): - tree_dict = {} - all_func_names = get_all_funcnames() - tree_dict["IGNORE_FUNCS"] = ",".join( - get_sdk_funcs() + get_g_api_funcs() + get_main_funcs() - ) - print("Functions loaded.") - print(f"Function count: {len(all_func_names)}") - print("Building call trees...") - for path in Path("asm").rglob("*.s"): - f = str(path) - if "mad" in f: # Skip mad for now, it has weird symbols - continue - if not "nonmatchings" in f or "psxsdk" in f: - continue - overlay = path.parents[2].name - with open(f) as opened_f: - filelines = opened_f.read().split("\n") - foundfuncs = {} - for i, line in enumerate(filelines): - if "jal" in line: - funcname = handle_jal_call(filelines, i, all_func_names) - if funcname not in foundfuncs: - foundfuncs[funcname] = 1 - else: - foundfuncs[funcname] += 1 - tree_dict[path.stem] = ",".join( - [overlay] + [f"{func}-{count}" for func, count in foundfuncs.items()] - ) - return tree_dict - - -def get_all_c_files(src_dir): - c_files_list = list() - for root, dirs, files in os.walk(src_dir): - for f in files: - if f.endswith(".c"): - c_files_list.append(os.path.join(root, f)) - return c_files_list - - -class NonMatchingFunc(object): - def __init__(self, nonmatching_path): - split = nonmatching_path.split("/") - - self.asm_path = nonmatching_path - self.name = os.path.splitext(os.path.basename(nonmatching_path))[0] - nm_index = split.index("nonmatchings") - self.overlay_name = split[nm_index - 1] - self.text_offset = "/".join(split[nm_index + 1 : -1]) - assumed_path = f"/{self.overlay_name}/{self.text_offset}.c" - c_paths = [src for src in src_files if src.endswith(assumed_path)] - if len(c_paths) != 1: - print("Error getting cpath") - print(c_paths) - print(nonmatching_path) - print(assumed_path) - print(split) - exit(2) - self.src_path = c_paths[0] - - -def get_nonmatching_functions(base_path, func_name, overlay=None) -> list: - function_list = list() - for root, dirs, files in os.walk(base_path): - if "/nonmatchings/" in root: - for f in files: - if f == f"{func_name}.s": - full_path = os.path.join(root, f) - function = NonMatchingFunc(full_path) - function_list.append(function) - if len(function_list) > 1: - overlay_matches = [] - for potential_func in function_list: - if f"/{overlay}/" in potential_func.asm_path: - overlay_matches.append(potential_func) - if len(overlay_matches) > 1: - print(f"Multiple matches, failed to whittle") - print(base_path) - print(func_name) - print(overlay) - print([x.asm_path for x in overlay_matches]) - exit(3) - elif len(overlay_matches) == 0: - print(f"Multiple matches but none match overlay {overlay}") - exit(4) - elif len(function_list) == 0: - print("Function not found") - print(func_name) - exit(5) - return function_list[0] - - -def get_function_name(code): - # match a function with a return type, name, parameters, and opening curly brace - pattern = r"\w+\s+(\w+)\([^)]*\)\s*{" - match = re.match(pattern, code) - return match.group(1) if match is not None else None +# Functions in gameapi are often strange, especially in the Overlay member. +def get_gapi_funcs(): + found_functions = [] + # Load up symbols for the relative functions loaded into GameApi + with open("config/symbols.us.txt") as f: + symbols = f.readlines() + for symbol in symbols: + symbolname = symbol.split(" = ")[0] + if "g_api_" in symbolname: + found_functions.append(symbolname) + found_functions.append(symbolname.replace("_", ".")) + # Special case, first element of struct doesn't have a dedicated symbol. + found_functions.append("g_api.o.Update") + return found_functions def is_decompiled(srcfile, fname): @@ -289,173 +178,234 @@ def is_decompiled(srcfile, fname): return True -def analyze_function(fname, tree): - overlay = tree[fname][0].split(";")[0] - foundfunc = get_nonmatching_functions("asm", fname, overlay) - decomp_done = str(is_decompiled(foundfunc.src_path, fname)) - if "GRAPHICAL" in MODE: - graph = graphviz.Digraph(fname) - graph_colors = {"N/A": "lightblue", "True": "green", "False": "red"} - graph.node( - f"{overlay}/{fname}", style="filled", fillcolor=graph_colors[decomp_done] - ) - if MODE == "CMDLINE": - print(f"Analyzing {fname}; Decompiled: {decomp_done}") - print(f"Functions called:") - # Look through our asm file, and see who else we call. - if len(tree[fname]) < 2 and "GRAPHICAL_ALL" not in MODE: - print("No functions called.") - else: - overlay = tree[fname][0] - for item in tree[fname][1:]: - func, count = item.split("-") - if ( - func in tree["IGNORE_FUNCS"] - or func - in ["pfnEntityUpdate", "UnknownSDKFunction", "UnknownEntityFunction"] - or "D_" in func - ): - decomp_done = "N/A" - else: - function_object = get_nonmatching_functions("asm", func, overlay) - decomp_done = str(is_decompiled(function_object.src_path, func)) - if "GRAPHICAL" in MODE: - graph.node( - f"{overlay}/{func}", - style="filled", - fillcolor=graph_colors[decomp_done], - href=func + ".svg", - ) - graph.edge(f"{overlay}/{fname}", f"{overlay}/{func}", count) - if MODE == "CMDLINE": - print(f"{func} called {count} times; Decompiled: {decomp_done}") - # The opposite, find who calls us - if MODE == "CMDLINE": - print(f"\nFunctions which call this:") - for key, value in tree.items(): - overlay = value[0] - callees = value[1:] - if key == "IGNORE_FUNCS": - continue - if any(callee.startswith(f"{fname}-") for callee in callees): - callee_dict = {a: b for a, b in (x.split("-") for x in callees)} - if fname not in callee_dict: - if ("g_api_" + fname) in callee_dict: - fname = "g_api_" + fname +def get_c_filename(asm_filename): + assert "asm/us" in asm_filename and "/nonmatchings/" in asm_filename + # Step 1: Replace asm/us for src + srcpath = asm_filename.replace("asm/us", "src") + # Step 2: Remove the nonmatchings + no_nonmatchings = srcpath.replace("/nonmatchings/", "/") + # Little known rpartition drops the function name to get the last directory, which should be c file name. + c_filename = no_nonmatchings.rpartition("/")[0] + ".c" + assert os.path.exists(c_filename) + return c_filename + + +# Given a function name called from a caller, find which of a set of candidates is probably the function being called. +def find_func_match(caller, candidates): + prefix_lengths = [ + len(os.path.commonprefix([caller.c_filename, candidate.c_filename])) + for candidate in candidates + ] + # Assert that there is a single highest max (and not a tie) + assert prefix_lengths.count(max(prefix_lengths)) + best_match = candidates[prefix_lengths.index(max(prefix_lengths))] + return best_match + + +# Given a sotn_function object, go through and find all its function calls. +# Any function called should be added as a callee for the input_function, and +# the input_function should be added as a caller for the called function. +def analyze(input_function): + with open(input_function.asm_filename) as f: + asm_lines = f.read().split("\n") + for i, line in enumerate(asm_lines): + if "jal" in line: + callee_name = handle_jal_call(asm_lines, i) + + if callee_name.startswith("D_"): + input_function.add_callee(fake_function(callee_name)) + continue + if callee_name.startswith("Unknown"): + input_function.add_callee(fake_function(callee_name)) + continue + candidate_callees = function_lookup[callee_name] + assert len(candidate_callees) > 0 + # It's one of the special cases that we establish + if isinstance(candidate_callees, str): + input_function.add_callee(fake_function(callee_name)) + continue + # If there is only one candidate (only one function in the codebase by that name), we have it. + if len(candidate_callees) == 1: + callee = candidate_callees[0] + input_function.add_callee(callee) + callee.add_caller(input_function) else: - print(f"Function {fname} is not in {callee_dict}.") - print(key) - exit(6) - call_count = callee_dict[fname] - key_as_func = get_nonmatching_functions("asm", key, overlay) - decomp_done = str(is_decompiled(key_as_func.src_path, key)) - if "GRAPHICAL" in MODE: - graph.node( - f"{overlay}/{key}", - style="filled", - fillcolor=graph_colors[decomp_done], - href=key + ".svg", - ) - graph.edge(f"{overlay}/{key}", f"{overlay}/{fname}", call_count) - if MODE == "CMDLINE": - print(f"Called by {key} {call_count} times; Decompiled: {decomp_done}") - # Display the graph in a window on the screen - if MODE == "GRAPHICAL_SINGLE": - imgbytes = graph.pipe(format="png") - img = Image.open(io.BytesIO(imgbytes)) - img.show() - # Save graphs to files - if MODE == "GRAPHICAL_ALL": - filename = f"{output_dir}/{fname}.svg" + best_match = find_func_match(input_function, candidate_callees) + input_function.add_callee(best_match) + best_match.add_caller(input_function) + + +# Create an index.html which lists all functions with their overlays +def generate_html(function_list): + # Sort all functions into overlays, with the name as tiebreaker to sort within overlays + sorted_funcs = sorted( + [f for f in function_list if f.overlay != "mad"], + key=lambda x: (x.overlay, x.name), + ) + html = '
' + active_overlay = "" + # Now iterate through all functions, creating links to their SVG files. + for f in sorted_funcs: + # When the overlay changes, add a heading. + if f.overlay != active_overlay: + # End the previous overlay's list, unless this is the first overlay. + if active_overlay != "": + html += "" + active_overlay = f.overlay + html += f"