Add multiprocessing to disassembly script (#449)

* Fix naming bug in disasm.py

* Factor out segment disassembly

* Add multiprocessing for disassembly

* Add multiprocessing for symbol discovery
This commit is contained in:
Connor Anderson 2021-11-11 14:36:40 +00:00 committed by GitHub
parent 30ff2e12b0
commit 25c60c6911
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 163 additions and 83 deletions

View File

@ -232,7 +232,7 @@ setup:
## Assembly generation
disasm:
$(RM) -rf asm data
python3 tools/disasm/disasm.py
python3 tools/disasm/disasm.py -j $(N_THREADS)
diff-init: uncompressed
$(RM) -rf expected/

View File

@ -1,7 +1,14 @@
#!/usr/bin/env python3
import ast, math, os, re, struct
import argparse, ast, math, os, re, struct
from mips_isa import *
from multiprocessing import Pool
parser = argparse.ArgumentParser()
parser.add_argument('-j', dest='jobs', type=int, default=1, help='number of processes to run at once')
args = parser.parse_args()
jobs = args.jobs
ASM_OUT = "asm/"
DATA_OUT = "data/"
@ -304,7 +311,52 @@ def format_f32(f_wd):
return f".word 0x{f_wd:08X}"
return f".float {reduce_float(repr(f))}"
def find_symbols_in_text(data, rodata, vram, vram_rodata, data_regions, relocs):
def put_symbol(symbols_dict, setname, symbol):
if setname in symbols_dict:
symbols_dict[setname].add(symbol)
else:
symbols_dict[setname] = {symbol}
def put_symbols(symbols_dict, setname, symbols):
if setname in symbols_dict:
symbols_dict[setname].update(symbols)
else:
symbols_dict[setname] = symbols.copy()
def update_symbols_from_dict(symbols_dict):
for setname, symbol in symbols_dict.items():
if setname == "functions":
functions.update(symbol)
elif setname == "data_labels":
data_labels.update(symbol)
elif setname == "branch_labels":
branch_labels.update(symbol)
elif setname == "jtbls":
jtbls.update(symbol)
elif setname == "jtbl_labels":
jtbl_labels.update(symbol)
elif setname == "floats":
floats.update(symbol)
elif setname == "doubles":
doubles.update(symbol)
elif setname == "prospective_strings":
prospective_strings.update(symbol)
elif setname == "strings":
strings.update(symbol)
elif setname == "symbols":
symbols.update(symbol)
elif setname == "constants":
constants.update(symbol)
elif setname == "dwords":
dwords.update(symbol)
elif setname == "files":
files.update(symbol)
def find_symbols_in_text(data, rodata, vram, vram_rodata, data_regions, relocs, segment_name):
symbols_dict = dict()
print(f"Finding symbols from .text in {segment_name}")
if rodata is not None:
rodata_words = as_word_list(rodata)
@ -312,8 +364,8 @@ def find_symbols_in_text(data, rodata, vram, vram_rodata, data_regions, relocs):
assert vram % 0x10 == 0
functions.add(vram)
files.add(vram)
put_symbol(symbols_dict, "functions", vram)
put_symbol(symbols_dict, "files", vram)
# lui trackers are saved at branches to be restored when the branch target is reached
saved_lui_trackers = {} # vram: tracker list
@ -356,9 +408,9 @@ def find_symbols_in_text(data, rodata, vram, vram_rodata, data_regions, relocs):
for region in data_regions:
assert region[1] != 0
functions.add(region[1])
put_symbol(symbols_dict, "functions", region[1])
if region[1] % 0x10 == 0:
files.add(region[1])
put_symbol(symbols_dict, "files", region[1])
if relocs is not None:
"""
@ -376,7 +428,7 @@ def find_symbols_in_text(data, rodata, vram, vram_rodata, data_regions, relocs):
Relocated jump targets give us functions in this section
"""
assert insn.id == MIPS_INS_JAL , f"R_MIPS_26 applied to {insn.mnemonic} when it should be JAL"
functions.add(insn.target)
put_symbol(symbols_dict, "functions", insn.target)
elif reloc[1] == 5: # R_MIPS_HI16
"""
Relocated %hi gives us %hi values to match with associated %lo
@ -390,8 +442,8 @@ def find_symbols_in_text(data, rodata, vram, vram_rodata, data_regions, relocs):
"""
assert insn.id == MIPS_INS_ADDIU or insn.id in MIPS_LOAD_STORE_INSNS , f"R_MIPS_HI16 applied to {insn.mnemonic} when it should be ADDIU or a load/store"
symbol_value = (prev_hi << 0x10) + insn.imm
symbols.update({hi_vram : symbol_value})
symbols.update({vram + reloc[2] : symbol_value})
put_symbols(symbols_dict, "symbols", {hi_vram : symbol_value})
put_symbols(symbols_dict, "symbols", {vram + reloc[2] : symbol_value})
else:
assert False , "Invalid relocation type encountered"
@ -430,12 +482,12 @@ def find_symbols_in_text(data, rodata, vram, vram_rodata, data_regions, relocs):
if in_data:
continue
if vaddr in functions:
if vaddr in functions or vaddr in symbols_dict["functions"]:
# add func branch labels to all branch labels
branch_labels.update(func_branch_labels)
put_symbols(symbols_dict, "branch_labels", func_branch_labels)
func_branch_labels.clear()
# add func jtbl labels to all jtbl labels
jtbl_labels.update(func_jtbl_labels)
put_symbols(symbols_dict, "jtbl_labels", func_jtbl_labels)
func_jtbl_labels.clear()
# clear individual jtbl labels
individual_jtbl_labels.clear()
@ -455,11 +507,11 @@ def find_symbols_in_text(data, rodata, vram, vram_rodata, data_regions, relocs):
func_branch_labels.add(insn.offset)
delayed_insn = insn
elif insn.id == MIPS_INS_ERET:
functions.add(vaddr + 4)
put_symbol(symbols_dict, "functions", vaddr + 4)
elif insn.id in MIPS_JUMP_INSNS:
if insn.id == MIPS_INS_JAL:
# mark function at target
functions.add(insn.target)
put_symbol(symbols_dict, "functions", insn.target)
elif insn.id == MIPS_INS_J:
# mark label at target
func_branch_labels.add(insn.target)
@ -478,8 +530,8 @@ def find_symbols_in_text(data, rodata, vram, vram_rodata, data_regions, relocs):
if not end:
if n_padding > 0:
assert (vaddr + 8 + n_padding * 4) % 0x10 == 0 , f"padding to non-0x10 alignment?, 0x{vaddr + 8 + n_padding * 4:08X}"
files.add(vaddr + 8 + n_padding * 4)
functions.add(vaddr + 8 + n_padding * 4)
put_symbol(symbols_dict, "functions", vaddr + 8 + n_padding * 4)
put_symbol(symbols_dict, "functions", vaddr + 8 + n_padding * 4)
delayed_insn = insn
elif insn.id == MIPS_INS_LUI:
"""
@ -503,8 +555,8 @@ def find_symbols_in_text(data, rodata, vram, vram_rodata, data_regions, relocs):
(imm_value >= 0xA400 and imm_value < 0xA480) or (imm_value < 0x0400 and insn.id == MIPS_INS_ADDIU)):
lo_vram = vaddr
symbol_value = (imm_value << 0x10) + insn.imm
symbols.update({hi_vram : symbol_value})
symbols.update({lo_vram : symbol_value})
put_symbols(symbols_dict, "symbols", {hi_vram : symbol_value})
put_symbols(symbols_dict, "symbols", {lo_vram : symbol_value})
if insn.id == MIPS_INS_LW:
# try find jr within the same block
cur_idx = i//4
@ -530,7 +582,7 @@ def find_symbols_in_text(data, rodata, vram, vram_rodata, data_regions, relocs):
break
label = rodata_words[offset]
jtbls.add(symbol_value)
put_symbol(symbols_dict, "jtbls", symbol_value)
# found a jr that matches, no need to keep searching
break
elif (insn.ft if insn.id in MIPS_FP_LOAD_STORE_INSNS else insn.rt) in [lookahead_insn.value_forname(field) for field in lookahead_insn.fields[1:]]:
@ -542,16 +594,16 @@ def find_symbols_in_text(data, rodata, vram, vram_rodata, data_regions, relocs):
break
lookahead_insn = decode_insn(raw_insns[cur_idx], vram + cur_idx * 4) # TODO fix vaddr here
elif insn.id == MIPS_INS_LD: # doubleword loads
dwords.add(symbol_value)
put_symbol(symbols_dict, "dwords", symbol_value)
elif insn.id in [MIPS_INS_LWC1, MIPS_INS_SWC1]: # float load/stores
# add float
floats.add(symbol_value)
put_symbol(symbols_dict, "floats", symbol_value)
elif insn.id in [MIPS_INS_LDC1, MIPS_INS_SDC1]: # double load/stores
# add double
doubles.add(symbol_value)
put_symbol(symbols_dict, "doubles", symbol_value)
elif insn.id == MIPS_INS_ADDIU and vaddr % 4 == 0: # strings seem to only ever be 4-byte aligned
# add possible string
prospective_strings.add(symbol_value)
put_symbol(symbols_dict, "prospective_strings", symbol_value)
# clear lui tracking state if register is clobbered by the addiu/load instruction itself
# insn.rt == (insn.rs if insn.id == MIPS_INS_ADDIU else insn.base) and
if insn.id not in MIPS_STORE_INSNS and insn.id not in MIPS_FP_LOAD_INSNS:
@ -562,8 +614,8 @@ def find_symbols_in_text(data, rodata, vram, vram_rodata, data_regions, relocs):
if hi_vram != None: # found match
lo_vram = vaddr
const_value = (imm_value << 0x10) | insn.imm
constants.update({hi_vram : const_value})
constants.update({lo_vram : const_value})
put_symbols(symbols_dict, "constants", {hi_vram : const_value})
put_symbols(symbols_dict, "constants", {lo_vram : const_value})
# clear lui tracking state if register is clobbered by the ori instruction itself
# insn.rt == insn.rs or
if insn.rt in lui_tracker.keys():
@ -613,16 +665,21 @@ def find_symbols_in_text(data, rodata, vram, vram_rodata, data_regions, relocs):
delay_slot = delayed_insn is not None
branch_labels.update(func_branch_labels)
jtbl_labels.update(func_jtbl_labels)
put_symbols(symbols_dict, "branch_labels", func_branch_labels)
put_symbols(symbols_dict, "jtbl_labels", func_jtbl_labels)
def find_symbols_in_data(data, vram, end, relocs):
return symbols_dict
def find_symbols_in_data(data, vram, end, relocs, segment_name):
symbols_dict = dict()
print(f"Finding symbols from .data in {segment_name}")
# read relocations for symbols
if relocs is not None:
for reloc in relocs:
if reloc[1] == 2: # R_MIPS_32
data_labels.add(as_word(data[reloc[2] : reloc[2] + 4]))
put_symbol(symbols_dict, "data_labels", as_word(data[reloc[2] : reloc[2] + 4]))
elif reloc[1] == 4: # R_MIPS_26
assert False , "R_MIPS_26 in .data section?"
elif reloc[1] == 5: # R_MIPS_HI16
@ -632,6 +689,7 @@ def find_symbols_in_data(data, vram, end, relocs):
else:
assert False , "Invalid relocation type encountered"
return symbols_dict
# TODO more
# matches several codepoint regions of EUC-JP
@ -753,7 +811,7 @@ def disassemble_text(data, vram, data_regions, segment):
if vaddr in segment[4].keys():
if cur_file != "":
os.makedirs(f"{ASM_OUT}/{segment_dirname}/", exist_ok=True)
with open(f"{ASM_OUT}/{segment_dirname}/{cur_file}.{section[2]}.s", "w") as outfile:
with open(f"{ASM_OUT}/{segment_dirname}/{cur_file}.text.s", "w") as outfile:
outfile.write(result)
result = asm_header(".text")
cur_file = segment[4][vaddr]
@ -832,7 +890,7 @@ def disassemble_text(data, vram, data_regions, segment):
result += f"/* {i:06X} {vaddr:08X} {raw_insn:08X} */ {mnemonic:12}{op_str}\n"
os.makedirs(f"{ASM_OUT}/{segment_dirname}/", exist_ok=True)
with open(f"{ASM_OUT}/{segment_dirname}/{cur_file}.{section[2]}.s", "w") as outfile:
with open(f"{ASM_OUT}/{segment_dirname}/{cur_file}.text.s", "w") as outfile:
outfile.write(result)
def disassemble_data(data, vram, end, segment):
@ -1093,12 +1151,11 @@ for segment in files_spec:
print(f"Finding Symbols")
# Find symbols for each segment
for segment in files_spec:
if segment[2] == 'makerom':
continue
print(f"Finding symbols in {segment[0]}")
print(f"Finding segment positions in {segment[0]}")
# vram segment start
if segment[3][0][0] not in variables_ast:
@ -1107,6 +1164,14 @@ for segment in files_spec:
if segment[3][-1][1] not in variables_ast:
variables_ast.update({segment[3][-1][1] : (f"_{segment[0]}SegmentEnd","u8","[]",0x1)})
pool = Pool(jobs)
# Find symbols for each segment
for segment in files_spec:
if segment[2] == 'makerom':
continue
#print(f"Finding symbols from .text and .data in {segment[0]}")
for section in segment[3]:
if section[2] == 'text':
data_regions = []
@ -1121,11 +1186,22 @@ for segment in files_spec:
break
else:
rodata_section = None
find_symbols_in_text(section[4], rodata_section[4] if rodata_section is not None else None,
section[0], rodata_section[0] if rodata_section is not None else None, data_regions, section[5])
pool.apply_async(find_symbols_in_text, args=(section[4], rodata_section[4] if rodata_section is not None else None,
section[0], rodata_section[0] if rodata_section is not None else None, data_regions, section[5], segment[0]), callback=update_symbols_from_dict)
elif section[2] == 'data':
find_symbols_in_data(section[4], section[0], section[1], section[5])
elif section[2] == 'rodata':
pool.apply_async(find_symbols_in_data, args=(section[4], section[0], section[1], section[5], segment[0]), callback=update_symbols_from_dict)
pool.close()
pool.join()
for segment in files_spec:
if segment[2] == 'makerom':
continue
print(f"Finding symbols from .rodata and .dmadata in {segment[0]}")
for section in segment[3]:
if section[2] == 'rodata':
find_symbols_in_rodata(section[4], section[0], section[1], section[5], segment)
elif section[2] == 'dmadata':
filenames = []
@ -1143,22 +1219,18 @@ for segment in files_spec:
i += 1
dmadata_entry = dmadata[i*0x10:(i+1)*0x10]
print("Disassembling Segments")
def disassemble_makerom(segment):
os.makedirs(f"{ASM_OUT}/makerom/", exist_ok=True)
rom_header = segment[3][0][4]
ipl3 = segment[3][1][4]
entry = segment[3][2][4]
# Textual disassembly for each segment
for segment in files_spec:
if segment[2] == 'makerom':
os.makedirs(f"{ASM_OUT}/makerom/", exist_ok=True)
rom_header = segment[3][0][4]
ipl3 = segment[3][1][4]
entry = segment[3][2][4]
pi_dom1_reg, clockrate, entrypoint, revision, \
chksum1, chksum2, pad1, pad2, \
rom_name, pad3, cart, cart_id, \
region, version = struct.unpack(">IIIIIIII20sII2s1sB", rom_header)
pi_dom1_reg, clockrate, entrypoint, revision, \
chksum1, chksum2, pad1, pad2, \
rom_name, pad3, cart, cart_id, \
region, version = struct.unpack(">IIIIIIII20sII2s1sB", rom_header)
out = f"""/*
out = f"""/*
* The Legend of Zelda: Majora's Mask ROM header
*/
@ -1177,47 +1249,47 @@ for segment in files_spec:
.ascii "{region.decode('ascii')}" /* Region */
.byte 0x{version:02X} /* Version */
"""
with open(ASM_OUT + "/makerom/rom_header.s", "w") as outfile:
outfile.write(out)
with open(ASM_OUT + "/makerom/rom_header.s", "w") as outfile:
outfile.write(out)
# TODO disassemble this eventually, low priority
out = f"{asm_header('.text')}\n.incbin \"baserom/makerom\", 0x40, 0xFC0\n"
# TODO disassemble this eventually, low priority
out = f"{asm_header('.text')}\n.incbin \"baserom/makerom\", 0x40, 0xFC0\n"
with open(ASM_OUT + "/makerom/ipl3.s", "w") as outfile:
outfile.write(out)
with open(ASM_OUT + "/makerom/ipl3.s", "w") as outfile:
outfile.write(out)
# hack: add symbol relocations manually
entry_addr = 0x80080000
# hack: add symbol relocations manually
entry_addr = 0x80080000
functions.add(entry_addr)
symbols.update({entry_addr + 0x00 : 0x80099500,
entry_addr + 0x04 : 0x80099500,
entry_addr + 0x20 : 0x80080060,
entry_addr + 0x24 : 0x80099EF0,
entry_addr + 0x28 : 0x80080060,
entry_addr + 0x30 : 0x80099EF0})
branch_labels.add(entry_addr + 0xC)
functions.add(entry_addr)
symbols.update({entry_addr + 0x00 : 0x80099500,
entry_addr + 0x04 : 0x80099500,
entry_addr + 0x20 : 0x80080060,
entry_addr + 0x24 : 0x80099EF0,
entry_addr + 0x28 : 0x80080060,
entry_addr + 0x30 : 0x80099EF0})
branch_labels.add(entry_addr + 0xC)
disassemble_text(entry, entry_addr, [], segment)
disassemble_text(entry, entry_addr, [], segment)
# manually set boot bss size...
entry_asm = ""
with open(f"{ASM_OUT}/makerom/entry.reloc.s") as infile: # TODO why is this written out as .reloc.s
entry_asm = infile.read()
# manually set boot bss size...
entry_asm = ""
with open(f"{ASM_OUT}/makerom/entry.text.s") as infile:
entry_asm = infile.read()
entry_asm = entry_asm.replace("0x63b0", "%lo(_bootSegmentBssSize)")
with open(f"{ASM_OUT}/makerom/entry.s", "w") as outfile:
outfile.write(entry_asm)
entry_asm = entry_asm.replace("0x63b0", "%lo(_bootSegmentBssSize)")
with open(f"{ASM_OUT}/makerom/entry.s", "w") as outfile:
outfile.write(entry_asm)
os.remove(f"{ASM_OUT}/makerom/entry.reloc.s")
os.remove(f"{ASM_OUT}/makerom/entry.text.s")
#out = f"{asm_header('.text')}\n.incbin \"baserom/makerom\", 0x1000, 0x60\n"
#out = f"{asm_header('.text')}\n.incbin \"baserom/makerom\", 0x1000, 0x60\n"
#with open(ASM_OUT + "/makerom/entry.s", "w") as outfile:
# outfile.write(out)
#with open(ASM_OUT + "/makerom/entry.s", "w") as outfile:
# outfile.write(out)
continue
elif segment[2] == 'dmadata':
def disassemble_segment(segment):
if segment[2] == 'dmadata':
os.makedirs(f"{ASM_OUT}/dmadata/", exist_ok=True)
out = f""".include "macro.inc"
@ -1262,7 +1334,7 @@ glabel {variables_ast[0x8009F8B0][0]}
"""
with open(ASM_OUT + "/dmadata/dmadata.s", "w") as outfile:
outfile.write(out)
continue
return
for section in segment[3]:
if (section[0] == section[1] and section[2] != 'reloc') or (section[2] != 'bss' and len(section[4]) == 0):
@ -1299,6 +1371,14 @@ glabel {variables_ast[0x8009F8B0][0]}
with open(f"{DATA_OUT}/{segment_dirname}/{segment[0]}.reloc.s", "w") as outfile:
outfile.write(result)
print("Disassembling Segments")
disassemble_makerom(next(segment for segment in files_spec if segment[2] == 'makerom'))
# Textual disassembly for each segment
with Pool(jobs) as p:
p.map(disassemble_segment, (segment for segment in files_spec if segment[2] != 'makerom'))
print("Splitting text and migrating rodata")
func_regex = re.compile(r'\n\nglabel \S+\n')