add decompiler

This commit is contained in:
water 2020-08-22 23:30:17 -04:00
parent ba3c3af43e
commit c3aff47886
52 changed files with 10200 additions and 5 deletions

View File

@ -47,7 +47,7 @@ Design:
- Workflow for development:
- `./gc.sh` : run the compiler in interactive mode
- `./gs.sh` : run a goos interpreter in interactive mode
- `./decomp.sh ./iso_data` : run the decompiler
- `./decomp.sh` : run the decompiler
Current state:
- GOAL compiler just implements the GOOS Scheme Macro Language. Running `./gc.sh` just loads the GOOS library (`goalc/gs/goos-lib.gs`) and then goes into an interactive mode. Use `(exit)` to exit.
@ -79,7 +79,8 @@ TODOS:
- performance stats for `SystemThread` (probably just get rid of these performance stats completely)
- `mmap`ing executable memory
- line input library (appears windows compatible?)
- Clean up possible duplicate code in compiler/decompiler `util` folder
- Clean up possible duplicate code in compiler/decompiler `util` folder, consider a common utility library
- Clean up header guard names (or just use `#pragma once`?)
- Investigate a better config format
- The current JSON library seems to have issues with comments, which I really like
- Clean up use of namespaces
@ -90,9 +91,9 @@ TODOS:
- Listener protocol document
- GOAL Compiler IR
- GOAL Compiler Skeleton
In Progress:
- GOAL emitter / emitter testing setup
- Gtest setup for checking decompiler results against hand-decompiled stuff
- Clean up decompiler print spam, finish up the CFG stuff
- Decompiler document
Project Description

6
decomp.sh Executable file
View File

@ -0,0 +1,6 @@
#!/bin/bash
# Launch the decompiler binary from the build tree, using paths relative to this script.

# Directory of this script
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

# Every path is quoted so the script keeps working when the checkout lives in a
# directory whose name contains spaces or glob characters.
"$DIR/build/decompiler/decompiler" \
  "$DIR/decompiler/config/jak1_ntsc_black_label.jsonc" \
  "$DIR/iso_data" \
  "$DIR/decompiler_out"

View File

@ -0,0 +1,25 @@
# The decompiler executable. Each source file is listed exactly once, one entry per line.
add_executable(decompiler
        util/LispPrint.cpp
        main.cpp
        ObjectFile/ObjectFileDB.cpp
        Disasm/Instruction.cpp
        Disasm/InstructionDecode.cpp
        Disasm/OpcodeInfo.cpp
        Disasm/Register.cpp
        ObjectFile/LinkedObjectFileCreation.cpp
        ObjectFile/LinkedObjectFile.cpp
        Function/Function.cpp
        util/FileIO.cpp
        config.cpp
        util/Timer.cpp
        Function/BasicBlocks.cpp
        Disasm/InstructionMatching.cpp
        TypeSystem/GoalType.cpp
        TypeSystem/GoalFunction.cpp
        TypeSystem/GoalSymbol.cpp
        TypeSystem/TypeInfo.cpp
        TypeSystem/TypeSpec.cpp
        Function/CfgVtx.cpp
        Function/CfgVtx.h)

target_link_libraries(decompiler
        minilzo)

View File

@ -0,0 +1,304 @@
/*!
* @file Instruction.cpp
* An EE instruction, represented as an operation, plus a list of source/destination atoms.
* Can print itself (within the context of a LinkedObjectFile).
*/
#include "Instruction.h"
#include "decompiler/ObjectFile/LinkedObjectFile.h"
#include <cassert>
/*!
 * Convert atom to a string for disassembly.
 * Labels are turned into their names by asking the given LinkedObjectFile.
 * An atom whose kind is not one of the printable kinds is a programming error: it trips the
 * assert in debug builds and yields a placeholder string when asserts are compiled out.
 */
std::string InstructionAtom::to_string(const LinkedObjectFile& file) const {
  switch (kind) {
    case REGISTER:
      return reg.to_string();
    case IMM:
      return std::to_string(imm);
    case LABEL:
      return file.get_label_name(label_id);
    case VU_ACC:
      return "acc";
    case VU_Q:
      return "Q";
    case IMM_SYM:
      return sym;
    default:
      assert(false);
      // With NDEBUG the assert vanishes; without this return, control would run off the end of
      // a non-void function, which is undefined behavior.
      return "INVALID";
  }
}
/*!
 * Turn this atom into a REGISTER atom holding r.
 */
void InstructionAtom::set_reg(Register r) {
  reg = r;
  kind = REGISTER;
}
/*!
 * Turn this atom into an IMM atom holding the integer i.
 */
void InstructionAtom::set_imm(int32_t i) {
  imm = i;
  kind = IMM;
}
/*!
 * Turn this atom into a LABEL atom referring to the label with the given id.
 */
void InstructionAtom::set_label(int id) {
  label_id = id;
  kind = LABEL;
}
/*!
* Make this atom the VU ACC register.
*/
void InstructionAtom::set_vu_acc() {
kind = VU_ACC;
}
/*!
* Make this atom the VU0 Q register.
*/
void InstructionAtom::set_vu_q() {
kind = VU_Q;
}
/*!
 * Turn this atom into an IMM_SYM atom. The symbol name is taken by value and moved into place.
 */
void InstructionAtom::set_sym(std::string _sym) {
  sym = std::move(_sym);
  kind = IMM_SYM;
}
/*!
 * Return the register held by this atom. Asserts that the atom is a REGISTER.
 */
Register InstructionAtom::get_reg() const {
  assert(REGISTER == kind);
  return reg;
}
/*!
 * Return the integer immediate held by this atom. Asserts that the atom is an IMM.
 */
int32_t InstructionAtom::get_imm() const {
  assert(IMM == kind);
  return imm;
}
/*!
 * Return the label id held by this atom. Asserts that the atom is a LABEL.
 */
int InstructionAtom::get_label() const {
  assert(LABEL == kind);
  return label_id;
}
/*!
 * Return a copy of the symbol name held by this atom. Asserts that the atom is an IMM_SYM.
 */
std::string InstructionAtom::get_sym() const {
  assert(IMM_SYM == kind);
  return sym;
}
/*!
 * True if this atom is a symbolic link (IMM_SYM) or a label (LABEL); false for every other
 * kind of atom.
 */
bool InstructionAtom::is_link_or_label() const {
  switch (kind) {
    case IMM_SYM:
    case LABEL:
      return true;
    default:
      return false;
  }
}
/*!
* Convert entire instruction to a string.
*/
std::string Instruction::to_string(const LinkedObjectFile& file) const {
auto& info = gOpcodeInfo[(int)kind];
// the name
std::string result = info.name;
// optional "interlock" specification.
if (il != 0xff) {
result.append(il ? ".i" : ".ni");
}
// optional "broadcast" specification for COP2 opcodes.
if (cop2_bc != 0xff) {
switch (cop2_bc) {
case 0:
result.push_back('x');
break;
case 1:
result.push_back('y');
break;
case 2:
result.push_back('z');
break;
case 3:
result.push_back('w');
break;
default:
result.push_back('?');
break;
}
}
// optional "destination" specification for COP2 opcodes.
if (cop2_dest != 0xff) {
result += ".";
if (cop2_dest & 8)
result.push_back('x');
if (cop2_dest & 4)
result.push_back('y');
if (cop2_dest & 2)
result.push_back('z');
if (cop2_dest & 1)
result.push_back('w');
}
// relative store and load instructions have a special syntax in MIPS
if (info.is_store) {
assert(n_dst == 0);
assert(n_src == 3);
result += " ";
result += src[0].to_string(file);
result += ", ";
result += src[1].to_string(file);
result += "(";
result += src[2].to_string(file);
result += ")";
} else if (info.is_load) {
assert(n_dst == 1);
assert(n_src == 2);
result += " ";
result += dst[0].to_string(file);
result += ", ";
result += src[0].to_string(file);
result += "(";
result += src[1].to_string(file);
result += ")";
} else {
// for instructions that aren't a store or load, the dest/sources are comma separated.
bool end_comma = false;
for (uint8_t i = 0; i < n_dst; i++) {
result += " " + dst[i].to_string(file) + ",";
end_comma = true;
}
for (uint8_t i = 0; i < n_src; i++) {
result += " " + src[i].to_string(file) + ",";
end_comma = true;
}
if (end_comma) {
result.pop_back();
}
}
return result;
}
/*!
 * True unless the instruction's kind is still InstructionKind::UNKNOWN.
 */
bool Instruction::is_valid() const {
  return !(kind == InstructionKind::UNKNOWN);
}
/*!
 * Append a copy of the atom to the destination list.
 * Asserts that there is still room (at most MAX_INTRUCTION_DEST destinations).
 */
void Instruction::add_dst(InstructionAtom& a) {
  assert(n_dst < MAX_INTRUCTION_DEST);
  dst[n_dst] = a;
  n_dst++;
}
/*!
 * Append a copy of the atom to the source list.
 * Asserts that there is still room (at most MAX_INSTRUCTION_SOURCE sources).
 */
void Instruction::add_src(InstructionAtom& a) {
  assert(n_src < MAX_INSTRUCTION_SOURCE);
  src[n_src] = a;
  n_src++;
}
/*!
 * Return the first source atom that is an integer immediate.
 * If there is none, this asserts; with asserts disabled it falls back to src[0].
 */
InstructionAtom& Instruction::get_imm_src() {
  int idx = 0;
  while (idx < n_src && src[idx].kind != InstructionAtom::IMM) {
    idx++;
  }
  if (idx < n_src) {
    return src[idx];
  }
  assert(false);
  return src[0];
}
/*!
* Try to find a src which is an integer immediate, and return it as an integer.
*/
int32_t Instruction::get_imm_src_int() {
return get_imm_src().get_imm();
}
/*!
 * Bounds-checked access to a destination atom: asserts idx < n_dst.
 */
InstructionAtom& Instruction::get_dst(size_t idx) {
  assert(idx < n_dst);
  InstructionAtom& atom = dst[idx];
  return atom;
}
/*!
 * Bounds-checked access to a source atom: asserts idx < n_src.
 */
InstructionAtom& Instruction::get_src(size_t idx) {
  assert(idx < n_src);
  InstructionAtom& atom = src[idx];
  return atom;
}
/*!
 * Bounds-checked read-only access to a destination atom: asserts idx < n_dst.
 */
const InstructionAtom& Instruction::get_dst(size_t idx) const {
  assert(idx < n_dst);
  const InstructionAtom& atom = dst[idx];
  return atom;
}
/*!
 * Bounds-checked read-only access to a source atom: asserts idx < n_src.
 */
const InstructionAtom& Instruction::get_src(size_t idx) const {
  assert(idx < n_src);
  const InstructionAtom& atom = src[idx];
  return atom;
}
/*!
 * Look up this instruction's entry in the global gOpcodeInfo table, indexed by its kind.
 */
const OpcodeInfo& Instruction::get_info() const {
  const int table_index = int(kind);
  return gOpcodeInfo[table_index];
}
/*!
* Get the target label for this instruction. If the instruction doesn't have a target label,
* return -1.
*/
int Instruction::get_label_target() const {
int result = -1;
for (int i = 0; i < n_src; i++) {
if (src[i].kind == InstructionAtom::AtomKind::LABEL) {
assert(result == -1);
result = src[i].get_label();
}
}
return result;
}

View File

@ -0,0 +1,89 @@
/*!
* @file Instruction.h
* An EE instruction, represented as an operation, plus a list of source/destination atoms.
* Can print itself (within the context of a LinkedObjectFile).
*/
#ifndef NEXT_INSTRUCTION_H
#define NEXT_INSTRUCTION_H
#include "OpcodeInfo.h"
#include "Register.h"
class LinkedObjectFile;
// Capacity of Instruction::src - the most source atoms a single Instruction can hold.
constexpr int MAX_INSTRUCTION_SOURCE = 3;
// Capacity of Instruction::dst - the most destination atoms a single Instruction can hold.
// NOTE(review): the identifier is spelled "INTRUCTION" (missing an 'S'); renaming it requires
// updating every user at the same time.
constexpr int MAX_INTRUCTION_DEST = 1;
// An "atom", representing a single register, immediate, etc... for use in an Instruction.
// The atom is a tagged value: `kind` says which of the private payload fields is meaningful.
// Use the set_* functions to change the kind and payload together, and the get_* functions to
// read the payload (they assert that the kind matches).
struct InstructionAtom {
  enum AtomKind {
    REGISTER,  // An EE Register
    IMM,       // An immediate value (stored as int32)
    IMM_SYM,   // An immediate value (a symbolic link)
    LABEL,     // A label in a LinkedObjectFile
    VU_ACC,    // The VU0 Accumulator
    VU_Q,      // The VU0 Q Register
    INVALID
  } kind = INVALID;

  void set_reg(Register r);
  void set_imm(int32_t i);
  void set_label(int id);
  void set_vu_q();
  void set_vu_acc();
  void set_sym(std::string _sym);

  Register get_reg() const;
  int32_t get_imm() const;
  int get_label() const;
  std::string get_sym() const;

  // Convert to a string for disassembly; labels are named using the given file.
  std::string to_string(const LinkedObjectFile& file) const;

  // True if this atom is a symbolic link (IMM_SYM) or a LABEL.
  bool is_link_or_label() const;

 private:
  // Atoms are copied by value into Instruction::src / Instruction::dst, so the integer
  // payloads get defined defaults: copying an uninitialized int reads an indeterminate value.
  int32_t imm = 0;    // payload for IMM
  int label_id = -1;  // payload for LABEL
  Register reg;       // payload for REGISTER
  std::string sym;    // payload for IMM_SYM
};
// An "Instruction", consisting of a "kind" (the opcode), and the source/destination atoms it
// operates on.
class Instruction {
public:
// The opcode. Stays UNKNOWN until set; is_valid() reports whether it is something else.
InstructionKind kind = InstructionKind::UNKNOWN;
// Render as disassembly text; labels are named using the given file.
std::string to_string(const LinkedObjectFile& file) const;
// True if kind is not UNKNOWN.
bool is_valid() const;
// Append a copy of the atom to src/dst. Asserts that the array is not already full.
void add_src(InstructionAtom& a);
void add_dst(InstructionAtom& a);
// Bounds-checked accessors: assert idx < n_src / idx < n_dst.
InstructionAtom& get_src(size_t idx);
InstructionAtom& get_dst(size_t idx);
const InstructionAtom& get_src(size_t idx) const;
const InstructionAtom& get_dst(size_t idx) const;
// source and destination atoms
// n_src / n_dst count the used entries at the front of src / dst.
uint8_t n_src = 0, n_dst = 0;
InstructionAtom src[MAX_INSTRUCTION_SOURCE];
InstructionAtom dst[MAX_INTRUCTION_DEST];
// First source atom that is an integer immediate (asserts if there is none).
InstructionAtom& get_imm_src();
// Value of the atom found by get_imm_src().
int32_t get_imm_src_int();
// Entry of the global gOpcodeInfo table for this instruction's kind.
const OpcodeInfo& get_info() const;
// Label id used by a source atom, or -1 if no source is a label.
int get_label_target() const;
// extra fields for some COP2 instructions.
uint8_t cop2_dest = 0xff; // 0xff indicates "don't print dest"
uint8_t cop2_bc = 0xff; // 0xff indicates "don't print bc"
uint8_t il = 0xff; // 0xff indicates "don't print il"
};
#endif // NEXT_INSTRUCTION_H

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,17 @@
/*!
* @file InstructionDecode.h
* The Instruction Decoder - converts a LinkedWord into a Instruction.
* This is the part of the disassembler that decodes MIPS instructions.
*/
#ifndef NEXT_INSTRUCTIONDECODE_H
#define NEXT_INSTRUCTIONDECODE_H
#include "Instruction.h"
class LinkedWord;
class LinkedObjectFile;
// Decode a single LinkedWord into an Instruction.
// NOTE(review): seg_id / word_id presumably identify where `word` sits inside `file` (segment
// index and word index) - confirm against InstructionDecode.cpp, which is not visible here.
Instruction decode_instruction(LinkedWord& word, LinkedObjectFile& file, int seg_id, int word_id);
#endif // NEXT_INSTRUCTIONDECODE_H

View File

@ -0,0 +1,350 @@
/*!
* @file InstructionMatching.cpp
* Utilities for checking if an instruction matches some criteria.
*/
#include <cassert>
#include "InstructionMatching.h"
/*!
* Check if the given instruction stores a GPR with the specified parameters.
*/
bool is_no_link_gpr_store(const Instruction& instr,
MatchParam<int> size,
MatchParam<Register> src,
MatchParam<int> offset,
MatchParam<Register> dest) {
// match the opcode
if (!size.is_wildcard) {
switch (size.value) {
case 1:
if (instr.kind != InstructionKind::SB) {
return false;
}
break;
case 2:
if (instr.kind != InstructionKind::SH) {
return false;
}
break;
case 4:
if (instr.kind != InstructionKind::SW) {
return false;
}
break;
case 8:
if (instr.kind != InstructionKind::SD) {
return false;
}
break;
case 16:
if (instr.kind != InstructionKind::SQ) {
return false;
}
break;
default:
assert(false);
}
} else {
// just make sure it's a gpr store
if (!is_gpr_store(instr)) {
return false;
}
}
assert(instr.n_src == 3);
// match other arguments
return src == instr.src[0].get_reg() && offset == instr.src[1].get_imm() &&
dest == instr.src[2].get_reg();
}
/*!
* Check if the given instruction loads a GPR with the specified parameters.
* LD and LQ count as signed, unsigned, and "wildcard signed" loads.
* LWL/LWR/LDL/LDR will never match.
*
* "no ll" means no link or label
*/
bool is_no_ll_gpr_load(const Instruction& instr,
MatchParam<int> size,
MatchParam<bool> is_signed,
MatchParam<Register> dst_reg,
MatchParam<int> offset,
MatchParam<Register> mem_reg) {
// match the opcode
if (!size.is_wildcard) {
if (is_signed.is_wildcard) {
switch (size.value) {
case 1:
if (instr.kind != InstructionKind::LB && instr.kind != InstructionKind::LBU) {
return false;
}
break;
case 2:
if (instr.kind != InstructionKind::LH && instr.kind != InstructionKind::LHU) {
return false;
}
break;
case 4:
if (instr.kind != InstructionKind::LW && instr.kind != InstructionKind::LWU) {
return false;
}
break;
case 8:
if (instr.kind != InstructionKind::LD) {
return false;
}
break;
case 16:
if (instr.kind != InstructionKind::LQ) {
return false;
}
break;
default:
assert(false);
}
} else {
if (is_signed.value) {
switch (size.value) {
case 1:
if (instr.kind != InstructionKind::LB) {
return false;
}
break;
case 2:
if (instr.kind != InstructionKind::LH) {
return false;
}
break;
case 4:
if (instr.kind != InstructionKind::LW) {
return false;
}
break;
case 8:
if (instr.kind != InstructionKind::LD) {
return false;
}
break;
case 16:
if (instr.kind != InstructionKind::LQ) {
return false;
}
break;
default:
assert(false);
}
} else {
switch (size.value) {
case 1:
if (instr.kind != InstructionKind::LBU) {
return false;
}
break;
case 2:
if (instr.kind != InstructionKind::LHU) {
return false;
}
break;
case 4:
if (instr.kind != InstructionKind::LWU) {
return false;
}
break;
case 8:
if (instr.kind != InstructionKind::LD) {
return false;
}
break;
case 16:
if (instr.kind != InstructionKind::LQ) {
return false;
}
break;
default:
assert(false);
}
}
}
} else {
// just make sure it's a gpr store
if (!is_gpr_load(instr, is_signed)) {
return false;
}
}
// match other arguments
return dst_reg == instr.get_dst(0).get_reg() && offset == instr.get_src(0).get_imm() &&
mem_reg == instr.get_src(1).get_reg();
}
/*!
* Check if the instruction stores an FPR (SWC1)
* "no ll" means that there is no label or linking involved.
*/
bool is_no_ll_fpr_store(const Instruction& instr,
MatchParam<Register> src,
MatchParam<int> offset,
MatchParam<Register> dest) {
return instr.kind == InstructionKind::SWC1 && src == instr.src[0].get_reg() &&
offset == instr.src[1].get_imm() && dest == instr.src[2].get_reg();
}
/*!
 * Check if the instruction loads an FPR (LWC1) with the given destination register, integer
 * offset and base address register.
 * "no ll" means that there is no label or linking involved.
 */
bool is_no_ll_fpr_load(const Instruction& instr,
                       MatchParam<Register> dst_reg,
                       MatchParam<int> offset,
                       MatchParam<Register> mem_reg) {
  if (instr.kind != InstructionKind::LWC1) {
    return false;
  }
  if (!(dst_reg == instr.get_dst(0).get_reg())) {
    return false;
  }
  if (!(offset == instr.get_src(0).get_imm())) {
    return false;
  }
  return mem_reg == instr.get_src(1).get_reg();
}
namespace {
auto gpr_stores = {InstructionKind::SB, InstructionKind::SH, InstructionKind::SW,
InstructionKind::SD, InstructionKind::SQ};
auto gpr_signed_loads = {InstructionKind::LB, InstructionKind::LH, InstructionKind::LW,
InstructionKind::LD, InstructionKind::LQ};
auto gpr_unsigned_loads = {InstructionKind::LBU, InstructionKind::LHU, InstructionKind::LWU,
InstructionKind::LD, InstructionKind::LQ};
auto gpr_all_loads = {InstructionKind::LBU, InstructionKind::LB, InstructionKind::LH,
InstructionKind::LHU, InstructionKind::LW, InstructionKind::LWU,
InstructionKind::SD, InstructionKind::SQ};
} // namespace
/*!
 * True if the instruction's opcode is one of the GPR stores listed in gpr_stores
 * (sb, sh, sw, sd, sq).
 */
bool is_gpr_store(const Instruction& instr) {
  bool matched = false;
  for (auto store_kind : gpr_stores) {
    if (store_kind == instr.kind) {
      matched = true;
      break;
    }
  }
  return matched;
}
/*!
 * Is this a GPR load instruction of the requested signedness?
 * The opcode is looked up in gpr_all_loads (wildcard), gpr_signed_loads (signed) or
 * gpr_unsigned_loads (unsigned).
 * Only LB/LBU,LH/LHU,LW/LWU,LD,LQ are meant to be treated as loads; LD and LQ are meant to
 * count as signed, unsigned, and "wildcard signed".
 */
bool is_gpr_load(const Instruction& instr, MatchParam<bool> is_signed) {
  // Pick the candidate list first; is_signed.value is only read when it is not a wildcard.
  const auto& candidates = is_signed.is_wildcard
                               ? gpr_all_loads
                               : (is_signed.value ? gpr_signed_loads : gpr_unsigned_loads);
  for (auto load_kind : candidates) {
    if (load_kind == instr.kind) {
      return true;
    }
  }
  return false;
}
/*!
* Given a store, get the offset as an integer.
*/
int32_t get_gpr_store_offset_as_int(const Instruction& instr) {
assert(is_gpr_store(instr));
assert(instr.n_src == 3);
return instr.src[1].get_imm();
}
/*!
 * Match an instruction of the form "OP dst, src0, src1" in which all three operands are
 * registers. Each MatchParam may be a wildcard.
 */
bool is_gpr_3(const Instruction& instr,
              MatchParam<InstructionKind> kind,
              MatchParam<Register> dst,
              MatchParam<Register> src0,
              MatchParam<Register> src1) {
  // Checked in order; a failed check stops before later operands are touched.
  if (!(kind == instr.kind)) {
    return false;
  }
  if (!(dst == instr.get_dst(0).get_reg())) {
    return false;
  }
  if (!(src0 == instr.get_src(0).get_reg())) {
    return false;
  }
  return src1 == instr.get_src(1).get_reg();
}
/*!
 * Match an instruction of the form "OP dst, src, imm": a destination register, one source
 * register, and an integer immediate as the second source. Each MatchParam may be a wildcard.
 */
bool is_gpr_2_imm_int(const Instruction& instr,
                      MatchParam<InstructionKind> kind,
                      MatchParam<Register> dst,
                      MatchParam<Register> src,
                      MatchParam<int32_t> imm) {
  // Checked in order; a failed check stops before later operands are touched.
  if (!(kind == instr.kind)) {
    return false;
  }
  if (!(dst == instr.get_dst(0).get_reg())) {
    return false;
  }
  if (!(src == instr.get_src(0).get_reg())) {
    return false;
  }
  return imm == instr.get_src(1).get_imm();
}
/*!
 * Build the Register object for the given general purpose register.
 */
Register make_gpr(Reg::Gpr gpr) {
  Register result(Reg::GPR, gpr);
  return result;
}
/*!
 * Build the Register object for the floating point register with the given index.
 */
Register make_fpr(int fpr) {
  Register result(Reg::FPR, fpr);
  return result;
}
/*!
* Is this a "nop"? More specifically, it checks for sll r0, r0, 0, the recommended MIPS nop.
*/
bool is_nop(const Instruction& instr) {
return is_gpr_2_imm_int(instr, InstructionKind::SLL, make_gpr(Reg::R0), make_gpr(Reg::R0), 0);
}
/*!
 * Is this "jr ra", i.e. a JR whose source register is RA?
 */
bool is_jr_ra(const Instruction& instr) {
  if (instr.kind != InstructionKind::JR) {
    return false;
  }
  return instr.get_src(0).get_reg() == make_gpr(Reg::RA);
}
/*!
 * Is this instruction a branch, according to its OpcodeInfo flags?
 * likely = wildcard: any branch; true: only "likely" branches; false: only branches that are
 * not "likely".
 */
bool is_branch(const Instruction& instr, MatchParam<bool> likely) {
  const auto& op = instr.get_info();
  if (likely.is_wildcard) {
    return op.is_branch || op.is_branch_likely;
  }
  if (likely.value) {
    return op.is_branch_likely;
  }
  return op.is_branch && !op.is_branch_likely;
}
/*!
 * Is this a branch that is always taken? Only "beq r0, r0" is recognized.
 * The "likely" form (beql r0, r0) is treated as unexpected: it asserts, and reports true when
 * asserts are compiled out.
 */
bool is_always_branch(const Instruction& instr) {
  if (!is_branch(instr, {})) {
    return false;
  }

  const bool is_likely_form = (instr.kind == InstructionKind::BEQL);
  if (instr.kind != InstructionKind::BEQ && !is_likely_form) {
    return false;
  }

  // both compared registers must be r0
  const auto zero = make_gpr(Reg::R0);
  if (!(instr.get_src(0).get_reg() == zero)) {
    return false;
  }
  if (!(instr.get_src(1).get_reg() == zero)) {
    return false;
  }

  assert(!is_likely_form);
  return true;
}

View File

@ -0,0 +1,69 @@
#ifndef JAK_DISASSEMBLER_INSTRUCTIONMATCHING_H
#define JAK_DISASSEMBLER_INSTRUCTIONMATCHING_H
#include "Instruction.h"
/*!
 * A parameter for the instruction matching functions: either a concrete value of type T that
 * must compare equal, or a wildcard that matches anything.
 * A default-constructed MatchParam (for example "{}") is a wildcard.
 */
template <typename T>
struct MatchParam {
  // wildcard; value is value-initialized so it is never left indeterminate
  MatchParam() : value(), is_wildcard(true) {}

  // intentionally not explicit so you don't have to put MatchParam<whatever>(blah) everywhere
  MatchParam(T x) : value(x), is_wildcard(false) {}

  T value;
  bool is_wildcard = true;

  // True if this is a wildcard or the stored value equals other.
  // const-qualified so that const MatchParams can be compared too.
  bool operator==(const T& other) const { return is_wildcard || (value == other); }
  bool operator!=(const T& other) const { return !(*this == other); }
};
// Match a GPR store (sb/sh/sw/sd/sq) by size in bytes, stored register, integer offset and
// base address register. Any parameter may be a wildcard.
bool is_no_link_gpr_store(const Instruction& instr,
MatchParam<int> size,
MatchParam<Register> src,
MatchParam<int> offset,
MatchParam<Register> dest);
// Match a GPR load by size in bytes, signedness, destination register, integer offset and
// base address register. "no ll" means no link or label.
bool is_no_ll_gpr_load(const Instruction& instr,
MatchParam<int> size,
MatchParam<bool> is_signed,
MatchParam<Register> dst_reg,
MatchParam<int> offset,
MatchParam<Register> mem_reg);
// Match an FPR store (SWC1) by stored register, integer offset and base address register.
bool is_no_ll_fpr_store(const Instruction& instr,
MatchParam<Register> src,
MatchParam<int> offset,
MatchParam<Register> dest);
// Match an FPR load (LWC1) by destination register, integer offset and base address register.
bool is_no_ll_fpr_load(const Instruction& instr,
MatchParam<Register> dst_reg,
MatchParam<int> offset,
MatchParam<Register> mem_reg);
// Is this a GPR store instruction? sb,sh,sw,sd,sq
bool is_gpr_store(const Instruction& instr);
// Is this a GPR load instruction of the requested signedness (wildcard = either)?
bool is_gpr_load(const Instruction& instr, MatchParam<bool> is_signed);
// Given a GPR store, get its offset as an integer.
int32_t get_gpr_store_offset_as_int(const Instruction& instr);
// Match "OP dst, src0, src1" where all operands are registers.
bool is_gpr_3(const Instruction& instr,
MatchParam<InstructionKind> kind,
MatchParam<Register> dst,
MatchParam<Register> src0,
MatchParam<Register> src1);
// Match "OP dst, src, imm" where dst and src are registers and imm is an integer immediate.
bool is_gpr_2_imm_int(const Instruction& instr,
MatchParam<InstructionKind> kind,
MatchParam<Register> dst,
MatchParam<Register> src,
MatchParam<int32_t> imm);
// Is this sll r0, r0, 0?
bool is_nop(const Instruction& instr);
// Is this jr ra?
bool is_jr_ra(const Instruction& instr);
// Create a Register for a GPR / an FPR.
Register make_gpr(Reg::Gpr gpr);
Register make_fpr(int fpr);
// Is this a branch? likely selects "likely" branches (true), other branches (false) or
// both (wildcard).
bool is_branch(const Instruction& instr, MatchParam<bool> likely);
// Is this beq r0, r0 (a branch that is always taken)?
bool is_always_branch(const Instruction& instr);
#endif // JAK_DISASSEMBLER_INSTRUCTIONMATCHING_H

View File

@ -0,0 +1,499 @@
#include "OpcodeInfo.h"
#include <cassert>
// Global table of per-opcode information, indexed by InstructionKind.
OpcodeInfo gOpcodeInfo[(uint32_t)InstructionKind::EE_OP_MAX];

// Short aliases used by the table-building helpers in this file.
using IK = InstructionKind;
using FT = FieldType;
using DT = DecodeType;
// Mark opcode k as defined, give it a name, and return its table entry for further setup.
static OpcodeInfo& def(IK k, const char* name) {
  OpcodeInfo& entry = gOpcodeInfo[(uint32_t)k];
  entry.defined = true;
  entry.name = name;
  return entry;
}
// Define opcode k as a branch: sets is_branch and has_delay_slot.
static OpcodeInfo& def_branch(IK k, const char* name) {
  OpcodeInfo& entry = def(k, name);
  entry.has_delay_slot = true;
  entry.is_branch = true;
  return entry;
}
// Define opcode k as a "likely" branch: sets is_branch, is_branch_likely and has_delay_slot.
static OpcodeInfo& def_branch_likely(IK k, const char* name) {
  OpcodeInfo& entry = def(k, name);
  entry.has_delay_slot = true;
  entry.is_branch_likely = true;
  entry.is_branch = true;
  return entry;
}
// Define opcode k as a store: sets is_store.
static OpcodeInfo& def_store(IK k, const char* name) {
  OpcodeInfo& entry = def(k, name);
  entry.is_store = true;
  return entry;
}
// Define opcode k as a load: sets is_load.
static OpcodeInfo& def_load(IK k, const char* name) {
  OpcodeInfo& entry = def(k, name);
  entry.is_load = true;
  return entry;
}
// Operand-layout helpers. Each one registers, in order, the destination/source operands of an
// opcode on its OpcodeInfo and returns the result of the builder chain.
// Naming: a leading "d" is a destination, a leading "s" is a source, followed by the
// instruction field the operand is read from.

// dst: GPR in RT, src: GPR in RS, src: SIMM16 field as an immediate.
static OpcodeInfo& drt_srs_ssimm16(OpcodeInfo& info) {
return info.dst_gpr(FT::RT).src_gpr(FT::RS).src(FT::SIMM16, DT::IMM);
}
// src: GPR in RT, src: SIMM16 field as an immediate, src: GPR in RS.
static OpcodeInfo& srt_ssimm16_srs(OpcodeInfo& info) {
return info.src_gpr(FT::RT).src(FT::SIMM16, DT::IMM).src_gpr(FT::RS);
}
// dst: GPR in RT, src: SIMM16 field as an immediate, src: GPR in RS.
static OpcodeInfo& drt_ssimm16_srs(OpcodeInfo& info) {
return info.dst_gpr(FT::RT).src(FT::SIMM16, DT::IMM).src_gpr(FT::RS);
}
// dst: GPR in RD, src: GPR in RS, src: GPR in RT.
static OpcodeInfo& drd_srs_srt(OpcodeInfo& info) {
return info.dst_gpr(FT::RD).src_gpr(FT::RS).src_gpr(FT::RT);
}
// dst: GPR in RD, src: GPR in RT, src: GPR in RS.
static OpcodeInfo& drd_srt_srs(OpcodeInfo& info) {
return info.dst_gpr(FT::RD).src_gpr(FT::RT).src_gpr(FT::RS);
}
// dst: GPR in RD, src: GPR in RT, src: SA field as an immediate.
static OpcodeInfo& drd_srt_ssa(OpcodeInfo& info) {
return info.dst_gpr(FT::RD).src_gpr(FT::RT).src(FT::SA, DT::IMM);
}
// src: GPR in RS, src: GPR in RT, src: SIMM16 field as a branch target.
static OpcodeInfo& srs_srt_bt(OpcodeInfo& info) {
return info.src_gpr(FT::RS).src_gpr(FT::RT).src(FT::SIMM16, DT::BRANCH_TARGET);
}
// src: GPR in RS, src: SIMM16 field as a branch target.
static OpcodeInfo& srs_bt(OpcodeInfo& info) {
return info.src_gpr(FT::RS).src(FT::SIMM16, DT::BRANCH_TARGET);
}
// src: SIMM16 field as a branch target (no register operands).
static OpcodeInfo& bt(OpcodeInfo& info) {
return info.src(FT::SIMM16, DT::BRANCH_TARGET);
}
// dst: FPR in FD, src: FPR in FS, src: FPR in FT.
static OpcodeInfo& dfd_sfs_sft(OpcodeInfo& info) {
return info.dst_fpr(FT::FD).src_fpr(FT::FS).src_fpr(FT::FT);
}
// src: FPR in FS, src: FPR in FT (no destination operand is registered).
static OpcodeInfo& sfs_sft(OpcodeInfo& info) {
return info.src_fpr(FT::FS).src_fpr(FT::FT);
}
// dst: FPR in FD, src: FPR in FS.
static OpcodeInfo& dfd_sfs(OpcodeInfo& info) {
return info.dst_fpr(FT::FD).src_fpr(FT::FS);
}
// dst: GPR in RD (no sources).
static OpcodeInfo& drd(OpcodeInfo& info) {
return info.dst_gpr(FT::RD);
}
// COP2 operand layouts. "cd" registers the DEST field (decoded as DT::DEST) and "cb" the BC
// field (decoded as DT::BC); both are added through src(). "vf" operands use dst_vf/src_vf.
// NOTE(review): DEST / BC presumably end up in Instruction::cop2_dest / cop2_bc rather than in
// the printed source list - confirm against the decoder.

// DEST, dst: VF in FT, src: VF in FS.
static OpcodeInfo& cd_dvft_svfs(OpcodeInfo& info) {
return info.src(FT::DEST, DT::DEST).dst_vf(FT::FT).src_vf(FT::FS);
}
// DEST, dst: VF in FD, src: VF in FS, src: VF in FT.
static OpcodeInfo& cd_dvfd_svfs_svft(OpcodeInfo& info) {
return info.src(FT::DEST, DT::DEST).dst_vf(FT::FD).src_vf(FT::FS).src_vf(FT::FT);
}
// BC, DEST, dst: VF in FD, src: VF in FS, src: VF in FT.
static OpcodeInfo& cb_cd_dvfd_svfs_svft(OpcodeInfo& info) {
return info.src(FT::BC, DT::BC)
.src(FT::DEST, DT::DEST)
.dst_vf(FT::FD)
.src_vf(FT::FS)
.src_vf(FT::FT);
}
// BC, DEST, dst: the VU accumulator (DT::VU_ACC, no instruction field), src: VF in FS,
// src: VF in FT.
static OpcodeInfo& cb_cd_dacc_svfs_svft(OpcodeInfo& info) {
return info.src(FT::BC, DT::BC)
.src(FT::DEST, DT::DEST)
.dst(FT::ZERO, DT::VU_ACC)
.src_vf(FT::FS)
.src_vf(FT::FT);
}
// DEST, dst: VF in FD, src: VF in FS, src: the Q register (DT::VU_Q, no instruction field).
static OpcodeInfo& cd_dvfd_svfs_sq(OpcodeInfo& info) {
return info.src(FT::DEST, DT::DEST).dst_vf(FT::FD).src_vf(FT::FS).src(FT::ZERO, DT::VU_Q);
}
// DEST, dst: the VU accumulator (DT::VU_ACC, no instruction field), src: VF in FS,
// src: VF in FT.
static OpcodeInfo& cd_dacc_svfs_svft(OpcodeInfo& info) {
return info.src(FT::DEST, DT::DEST).dst(FT::ZERO, DT::VU_ACC).src_vf(FT::FS).src_vf(FT::FT);
}
void init_opcode_info() {
gOpcodeInfo[0].name = ";; ??????";
// RT, RS, SIMM
drt_srs_ssimm16(def(IK::DADDIU, "daddiu")); // Doubleword Add Immediate Unsigned
drt_srs_ssimm16(def(IK::ADDIU, "addiu")); // Add Immediate Unsigned Word
drt_srs_ssimm16(def(IK::SLTI, "slti")); // Set on Less Than Immediate
drt_srs_ssimm16(def(IK::SLTIU, "sltiu")); // Set on Less Than Immediate Unsigned
// stores in srt_ssimm16_srs
srt_ssimm16_srs(def_store(IK::SB, "sb")); // Store Byte
srt_ssimm16_srs(def_store(IK::SH, "sh")); // Store Halfword
srt_ssimm16_srs(def_store(IK::SW, "sw")); // Store Word
srt_ssimm16_srs(def_store(IK::SD, "sd")); // Store Doubleword
srt_ssimm16_srs(def_store(IK::SQ, "sq")); // Store Quadword
// loads in dsrt_ssimm16_srs
drt_ssimm16_srs(def_load(IK::LB, "lb")); // Load Byte
drt_ssimm16_srs(def_load(IK::LBU, "lbu")); // Load Byte Unsigned
drt_ssimm16_srs(def_load(IK::LH, "lh")); // Load Halfword
drt_ssimm16_srs(def_load(IK::LHU, "lhu")); // Load Halfword Unsigned
drt_ssimm16_srs(def_load(IK::LW, "lw")); // Load Word
drt_ssimm16_srs(def_load(IK::LWU, "lwu")); // Load Word Unsigned
drt_ssimm16_srs(def_load(IK::LD, "ld")); // Load Doubleword
drt_ssimm16_srs(def_load(IK::LQ, "lq")); // Load Quadword
drt_ssimm16_srs(def_load(IK::LDR, "ldr")); // Load Doubleword Left
drt_ssimm16_srs(def_load(IK::LDL, "ldl")); // Load Doubleword Right
drt_ssimm16_srs(def_load(IK::LWL, "lwl")); // Load Word Left
drt_ssimm16_srs(def_load(IK::LWR, "lwr")); // Load Word Right
// drd_srs_srt
drd_srs_srt(def(IK::DADDU, "daddu")); // Doubleword Add Unsigned
drd_srs_srt(def(IK::SUBU, "subu")); // Subtract Unsigned Word
drd_srs_srt(def(IK::ADDU, "addu")); // Add Unsigned Word
drd_srs_srt(def(IK::DSUBU, "dsubu")); // Doubleword Subtract Unsigned
drd_srs_srt(def(IK::MULT3, "mult3")); // Multiply Word
drd_srs_srt(def(IK::MULTU3, "multu3")); // Multiply Unsigned Word
drd_srs_srt(def(IK::AND, "and")); // And
drd_srs_srt(def(IK::OR, "or")); // Or
drd_srs_srt(def(IK::NOR, "nor")); // Not Or
drd_srs_srt(def(IK::XOR, "xor")); // Exclusive Or
drd_srs_srt(def(IK::MOVN, "movn")); // Move Conditional on Not Zero
drd_srs_srt(def(IK::MOVZ, "movz")); // Move Conditional on Zero
drd_srs_srt(def(IK::SLT, "slt")); // Set on Less Than
drd_srs_srt(def(IK::SLTU, "sltu")); // Set on Less Than Unsigned
// fixed shifts
drd_srt_ssa(def(IK::SLL, "sll")); // Shift Left Logical
drd_srt_ssa(def(IK::SRA, "sra")); // Shift Right Arithmetic
drd_srt_ssa(def(IK::SRL, "srl")); // Shift Right Logical
drd_srt_ssa(def(IK::DSLL, "dsll")); // Doubleword Shift Left Logical
drd_srt_ssa(def(IK::DSLL32, "dsll32")); // Doubleword Shift Left Logical Plus 32
drd_srt_ssa(def(IK::DSRA, "dsra")); // Doubleword Shift Right Arithmetic
drd_srt_ssa(def(IK::DSRA32, "dsra32")); // Doubleword Shift Right Arithmetic Plus 32
drd_srt_ssa(def(IK::DSRL, "dsrl")); // Doubleword Shift Right Logical
drd_srt_ssa(def(IK::DSRL32, "dsrl32")); // Doubleword Shift Right Logical Plus 32
// variable shifts
drd_srt_srs(def(IK::DSRAV, "dsrav")); // Doubleword Shift Right Arithmetic Variable
drd_srt_srs(def(IK::SLLV, "sllv")); // Shift Word Left Logical Variable
drd_srt_srs(def(IK::DSLLV, "dsllv")); // Doubleword Shift Left Logical Variable
drd_srt_srs(def(IK::DSRLV, "dsrlv")); // Doubleword Shift Right Logical Variable
// branch (two registers)
srs_srt_bt(def_branch(IK::BEQ, "beq")); // Branch on Equal
srs_srt_bt(def_branch(IK::BNE, "bne")); // Branch on Not Equal
srs_srt_bt(def_branch_likely(IK::BEQL, "beql")); // Branch on Equal Likely
srs_srt_bt(def_branch_likely(IK::BNEL, "bnel")); // Branch on Not Equal Likely
// branch (one register)
srs_bt(def_branch(IK::BLTZ, "bltz")); // Branch on Less Than Zero
srs_bt(def_branch(IK::BGEZ, "bgez")); // Branch on Greater Than or Equal to Zero
srs_bt(def_branch(IK::BLEZ, "blez")); // Branch on Less Than or Equal to Zero
srs_bt(def_branch(IK::BGTZ, "bgtz")); // Branch on Greater Than Zero
srs_bt(def_branch(IK::BGEZAL, "bgezal")); // Branch on Greater Than or Equal to Zero and Link
srs_bt(def_branch_likely(IK::BLTZL, "bltzl")); // Branch on Less Than Zero Likely
srs_bt(def_branch_likely(IK::BGTZL, "bgtzl")); // Branch on Greater Than Zero Likely
srs_bt(def_branch_likely(IK::BGEZL, "bgezl")); // Branch on Greater Than or Equal to Zero Likely
// weird ones
def(IK::DIV, "div").src_gpr(FT::RS).src_gpr(FT::RT); // Divide Word
def(IK::DIVU, "divu").src_gpr(FT::RS).src_gpr(FT::RT); // Divide Unsigned Word
def(IK::ORI, "ori").dst_gpr(FT::RT).src_gpr(FT::RS).src(FT::ZIMM16, DT::IMM); // Or Immediate
def(IK::XORI, "xori")
.dst_gpr(FT::RT)
.src_gpr(FT::RS)
.src(FT::ZIMM16, DT::IMM); // Exclusive Or Immediate
def(IK::ANDI, "andi").dst_gpr(FT::RT).src_gpr(FT::RS).src(FT::ZIMM16, DT::IMM); // And Immediate
def(IK::LUI, "lui").dst_gpr(FT::RT).src(FT::SIMM16, DT::IMM); // Load Upper Immediate
def(IK::JALR, "jalr").dst_gpr(FT::RD).src_gpr(FT::RS).has_delay_slot =
true; // Jump and Link Register
def(IK::JR, "jr").src_gpr(FT::RS).has_delay_slot = true; // Jump Register
def_load(IK::LWC1, "lwc1")
.dst_fpr(FT::FT)
.src(FT::SIMM16, DT::IMM)
.src_gpr(FT::RS); // Load Word to Floating Point
def_store(IK::SWC1, "swc1")
.src_fpr(FT::FT)
.src(FT::SIMM16, DT::IMM)
.src_gpr(FT::RS); // Store Word from Floating Point
// weird moves
def(IK::MFC1, "mfc1").dst_gpr(FT::RT).src_fpr(FT::FS); // Move Word from Floating Point
def(IK::MTC1, "mtc1").src_gpr(FT::RT).dst_fpr(FT::FS); // Move Word to Floating Point
def(IK::MTC0, "mtc0")
.src_gpr(FT::RT)
.dst(FT::RD, DT::COP0); // Move to System Control Coprocessor
def(IK::MFC0, "mfc0")
.dst_gpr(FT::RT)
.src(FT::RD, DT::COP0); // Move from System Control Coprocessor
def(IK::MTDAB, "mtdab").src_gpr(FT::RT); // Move to Data Address Breakpoint Register
def(IK::MTDABM, "mtdabm").src_gpr(FT::RT); // Move to Data Address Breakpoint Mask Register
drd(def(IK::MFHI, "mfhi")); // Move from HI Register
drd(def(IK::MFLO, "mflo")); // Move from LO Register
def(IK::MTLO1, "mtlo1").src_gpr(FT::RS); // Move to LO1 Register
drd(def(IK::MFLO1, "mflo1")); // Move from LO1 Register
drd(def(IK::PMFHL_UW, "pmfhl.uw")); // Parallel Move From HI/LO Register
drd(def(IK::PMFHL_LW, "pmfhl.lw"));
drd(def(IK::PMFHL_LH, "pmfhl.lh"));
def(IK::MFPC, "mfpc").dst_gpr(FT::RT).src(FT::PCR, DT::PCR); // Move from Performance Counter
def(IK::MTPC, "mtpc").src_gpr(FT::RT).dst(FT::PCR, DT::PCR); // Move to Performance Counter
// other weirds
def(IK::SYSCALL, "syscall").src(FT::SYSCALL, DT::IMM); // System Call
def(IK::CACHE_DXWBIN, "cache dxwbin")
.src_gpr(FT::RS)
.src(FT::SIMM16, DT::IMM); // Cache Operation (Index Writeback Invalidate)
def(IK::PREF, "pref").src_gpr(FT::RT).src(FT::SIMM16, DT::IMM).src_gpr(FT::RS); // Prefetch
// plains
def(IK::SYNCP, "sync.p"); // Synchronize Shared Memory (Pipeline)
def(IK::SYNCL, "sync.l"); // Synchronize Shared Memory (Load)
def(IK::ERET, "eret"); // Exception Return
def(IK::EI, "ei"); // Enable Interrupt
drd_srs_srt(def(IK::PPACB, "ppacb")); // Parallel Pack to Byte
drd_srs_srt(def(IK::PPACH, "ppach")); // Parallel Pack to Halfword
drd_srs_srt(def(IK::PPACW, "ppacw")); // Parallel Pack to Word
drd_srs_srt(def(IK::PADDH, "paddh")); // Parallel Add Halfword
drd_srs_srt(def(IK::PADDW, "paddw")); // Parallel Add Word
drd_srs_srt(def(IK::PSUBW, "psubw")); // Parallel Subtract Word
drd_srs_srt(def(IK::PMINH, "pminh")); // Parallel Minimize Halfword
drd_srs_srt(def(IK::PMINW, "pminw")); // Parallel Minimize Word
drd_srs_srt(def(IK::PMAXH, "pmaxh")); // Parallel Maximize Halfword
drd_srs_srt(def(IK::PMAXW, "pmaxw")); // Parallel Maximize Word
drd_srs_srt(def(IK::PEXTLB, "pextlb")); // Parallel Extend Lower from Byte
drd_srs_srt(def(IK::PEXTLH, "pextlh")); // Parallel Extend Lower from Halfword
drd_srs_srt(def(IK::PEXTLW, "pextlw")); // Parallel Extend Lower from Word
drd_srs_srt(def(IK::PCGTW, "pcgtw")); // Parallel Compare for Greater Than Word
drd_srs_srt(def(IK::PCEQB, "pceqb")); // Parallel Compare for Equal Byte
drd_srs_srt(def(IK::PCEQW, "pceqw")); // Parallel Compare for Equal Word
drd_srs_srt(def(IK::PEXTUB, "pextub")); // Parallel Extend Upper from Byte
drd_srs_srt(def(IK::PEXTUH, "pextuh")); // Parallel Extend Upper from Halfword
drd_srs_srt(def(IK::PEXTUW, "pextuw")); // Parallel Extend Upper from Word
drd_srs_srt(def(IK::PCPYUD, "pcpyud")); // Parallel Copy Upper Doubleword
drd_srs_srt(def(IK::PCPYLD, "pcpyld")); // Parallel Copy Lower Doubleword
drd_srs_srt(def(IK::PMADDH, "pmaddh")); // Parallel Multiply-Add Halfword
drd_srs_srt(def(IK::PMULTH, "pmulth")); // Parallel Multiply Halfword
drd_srs_srt(def(IK::PEXEW, "pexew")); // Parallel Exchange Even Word
drd_srs_srt(def(IK::PINTEH, "pinteh")); // Parallel Interleave Even Halfword
drd_srs_srt(def(IK::PAND, "pand")); // Parallel And
drd_srs_srt(def(IK::POR, "por")); // Parallel Or
drd_srs_srt(def(IK::PNOR, "pnor")); // Parallel Not Or
drd_srt_ssa(def(IK::PSLLW, "psllw")); // Parallel Shift Left Logical Word
drd_srt_ssa(def(IK::PSLLH, "psllh")); // Parallel Shift Left Logical Halfword
drd_srt_ssa(def(IK::PSRAW, "psraw")); // Parallel Shift Right Arithmetic Word
drd_srt_ssa(def(IK::PSRAH, "psrah")); // Parallel Shift Right Arithmetic Halfword
drd_srt_ssa(def(IK::PSRLH, "psrlh")); // Parallel Shift Right Logical Halfword
def(IK::PLZCW, "plzcw").dst_gpr(FT::RD).src_gpr(FT::RS); // Parallel Leading Zero Count Word
def(IK::PABSW, "pabsw").dst_gpr(FT::RD).src_gpr(FT::RT); // Parallel Absolute Word
def(IK::PROT3W, "prot3w").dst_gpr(FT::RD).src_gpr(FT::RT); // Parallel Rotate 3 Word
def(IK::PCPYH, "pcpyh").dst_gpr(FT::RD).src_gpr(FT::RT); // Parallel Copy Halfword
// COP1
// branch (no registers)
bt(def_branch(IK::BC1F, "bc1f")); // Branch on FP False
bt(def_branch(IK::BC1T, "bc1t")); // Branch on FP True
bt(def_branch_likely(IK::BC1FL, "bc1fl")); // Branch on FP False Likely
bt(def_branch_likely(IK::BC1TL, "bc1tl")); // Branch on FP True Likely
dfd_sfs_sft(def(IK::ADDS, "add.s")); // Floating Point Add
dfd_sfs_sft(def(IK::SUBS, "sub.s")); // Floating Point Subtract
dfd_sfs_sft(def(IK::MULS, "mul.s")); // Floating Point Multiply
dfd_sfs_sft(def(IK::DIVS, "div.s")); // Floating Point Divide
dfd_sfs_sft(def(IK::MINS, "min.s")); // Floating Point Minimum
dfd_sfs_sft(def(IK::MAXS, "max.s")); // Floating Point Maximum
dfd_sfs_sft(def(IK::MADDS, "madd.s")); // Floating Point Multiply-Add
dfd_sfs_sft(def(IK::MSUBS, "msub.s")); // Floating Point Multiply and Subtract
dfd_sfs_sft(def(IK::RSQRTS, "rsqrt.s")); // Floating Point Reciporcal Square Root
dfd_sfs(def(IK::ABSS, "abs.s")); // Floating Point Absolute Value
dfd_sfs(def(IK::NEGS, "neg.s")); // Floating Point Negate
dfd_sfs(def(IK::CVTSW, "cvt.s.w")); // Fixed-point Convert to Single Floating Point
dfd_sfs(def(IK::CVTWS, "cvt.w.s")); // Floating Point Convert to Word Fixed-point
dfd_sfs(def(IK::MOVS, "mov.s")); // Floating Point Move
dfd_sfs(def(IK::SQRTS, "sqrt.s")); // Floating Point Square Root
sfs_sft(def(IK::CLTS, "c.lt.s")); // Floating Point Compare
sfs_sft(def(IK::CLES, "c.le.s")); // Floating Point Compare
sfs_sft(def(IK::CEQS, "c.eq.s")); // Floating Point Compare
sfs_sft(def(IK::MULAS, "mula.s")); // Floating Point Multiply to Accumulator
sfs_sft(def(IK::MADDAS, "madda.s")); // Floating Point Multiply-Add to Accumulator
sfs_sft(def(IK::ADDAS, "adda.s")); // Floating Point Add to Accumulator
sfs_sft(def(IK::MSUBAS, "msuba.s")); // Floating Point Multiply and Subtract from Accumulator
// COP2 weirds
def_store(IK::SQC2, "sqc2")
.src(FT::FT, DT::VF)
.src(FT::SIMM16, DT::IMM)
.src_gpr(FT::RS); // Store Quadword from COP2
def_load(IK::LQC2, "lqc2")
.dst(FT::FT, DT::VF)
.src(FT::SIMM16, DT::IMM)
.src_gpr(FT::RS); // Load Quadword to COP2
// COP2
cd_dvft_svfs(def(IK::VMOVE, "vmove")); // Transfer between Floating-Point Registers
cd_dvft_svfs(def(IK::VFTOI0, "vftoi0")); // Conversion to Fixed Point
cd_dvft_svfs(def(IK::VFTOI4, "vftoi4")); // Conversion to Fixed Point
cd_dvft_svfs(def(IK::VFTOI12, "vftoi12")); // Conversion to Fixed Point
cd_dvft_svfs(def(IK::VITOF0, "vitof0")); // Conversion to Floating Point Number
cd_dvft_svfs(def(IK::VITOF12, "vitof12")); // Conversion to Floating Point Number
cd_dvft_svfs(def(IK::VITOF15, "vitof15")); // Conversion to Floating Point Number
cd_dvft_svfs(def(IK::VABS, "vabs")); // Absolute Value
cd_dvfd_svfs_svft(def(IK::VADD, "vadd"));
cd_dvfd_svfs_svft(def(IK::VSUB, "vsub"));
cd_dvfd_svfs_svft(def(IK::VMUL, "vmul"));
cd_dvfd_svfs_svft(def(IK::VMINI, "vmini"));
cd_dvfd_svfs_svft(def(IK::VMAX, "vmax"));
cd_dvfd_svfs_svft(def(IK::VOPMSUB, "vopmsub"));
cd_dvfd_svfs_svft(def(IK::VMADD, "vmadd"));
cd_dvfd_svfs_svft(def(IK::VMSUB, "vmsub"));
cb_cd_dvfd_svfs_svft(def(IK::VSUB_BC, "vsub"));
cb_cd_dvfd_svfs_svft(def(IK::VADD_BC, "vadd"));
cb_cd_dvfd_svfs_svft(def(IK::VMADD_BC, "vmadd"));
cb_cd_dvfd_svfs_svft(def(IK::VMSUB_BC, "vmsub"));
cb_cd_dvfd_svfs_svft(def(IK::VMUL_BC, "vmul"));
cb_cd_dvfd_svfs_svft(def(IK::VMINI_BC, "vmini"));
cb_cd_dvfd_svfs_svft(def(IK::VMAX_BC, "vmax"));
cb_cd_dacc_svfs_svft(def(IK::VADDA_BC, "vadda"));
cb_cd_dacc_svfs_svft(def(IK::VMADDA_BC, "vmadda"));
cb_cd_dacc_svfs_svft(def(IK::VMULA_BC, "vmula"));
cb_cd_dacc_svfs_svft(def(IK::VMSUBA_BC, "vmsuba"));
cd_dvfd_svfs_sq(def(IK::VADDQ, "vaddq"));
cd_dvfd_svfs_sq(def(IK::VSUBQ, "vsubq"));
cd_dvfd_svfs_sq(def(IK::VMULQ, "vmulq"));
cd_dvfd_svfs_sq(def(IK::VMSUBQ, "vmsubq"));
cd_dacc_svfs_svft(def(IK::VMULA, "vmula"));
cd_dacc_svfs_svft(def(IK::VADDA, "vadda"));
cd_dacc_svfs_svft(def(IK::VMADDA, "vmadda"));
cd_dacc_svfs_svft(def(IK::VOPMULA, "vopmula"));
// weird
def(IK::VDIV, "vdiv")
.dst(FT::ZERO, DT::VU_Q)
.src_vf(FT::FS)
.src_vf(FT::FT)
.src(FT::BC, DT::BC); // todo
def(IK::VRSQRT, "vrsqrt")
.dst(FT::ZERO, DT::VU_Q)
.src_vf(FT::FS)
.src_vf(FT::FT)
.src(FT::BC, DT::BC); // todo
def(IK::VCLIP, "vclip").src(FT::DEST, DT::DEST).src_vf(FT::FS).src_vf(FT::FT);
def(IK::VMULAQ, "vmulaq")
.src(FT::DEST, DT::DEST)
.dst(FT::ZERO, DT::VU_ACC)
.src_vf(FT::FS)
.src(FT::ZERO, DT::VU_Q);
def(IK::VRGET, "vrget").src(FT::DEST, DT::DEST).dst_vf(FT::FT);
// integer
def(IK::VMTIR, "vmtir").dst(FT::RT, DT::VI).src_vf(FT::FS).src(FT::BC, DT::BC);
def(IK::VIAND, "viand").dst_vi(FT::FD).src_vi(FT::FS).src_vi(FT::FT);
def(IK::VLQI, "vlqi").src(FT::DEST, DT::DEST).dst_vf(FT::FT).src_vi(FT::FS); // todo inc
def(IK::VSQI, "vsqi").src(FT::DEST, DT::DEST).src_vf(FT::FS).src_vi(FT::FT); // todo inc
def(IK::VIADDI, "viaddi").dst_vi(FT::FT).src_vi(FT::FS).src(FT::IMM5, DT::IMM);
def(IK::QMFC2, "qmfc2").src(FT::IL, DT::IL).dst_gpr(FT::RT).src_vf(FT::FS);
def(IK::QMTC2, "qmtc2").src(FT::IL, DT::IL).src_gpr(FT::RT).dst_vf(FT::FS);
def(IK::VSQRT, "vsqrt").src(FT::BC, DT::BC).dst(FT::ZERO, DT::VU_Q).src_vf(FT::FT);
def(IK::VRXOR, "vrxor").src(FT::BC, DT::BC).src_vf(FT::FS);
def(IK::VRNEXT, "vrnext").src(FT::DEST, DT::DEST).dst_vf(FT::FT);
def(IK::CTC2, "ctc2").src(FT::IL, DT::IL).src_gpr(FT::RT).dst(FT::RD, DT::VI);
def(IK::CFC2, "cfc2").src(FT::IL, DT::IL).dst_gpr(FT::RT).src(FT::RD, DT::VI);
def(IK::VCALLMS, "vcallms").src(FT::IMM15, DT::VCALLMS_TARGET);
def(IK::VNOP, "vnop");
def(IK::VWAITQ, "vwaitq");
uint32_t valid_count = 0, total_count = 0;
for (auto& info : gOpcodeInfo) {
if (info.defined) {
valid_count++;
}
total_count++;
}
// for the UNKNOWN op which shouldn't be valid.
total_count--;
assert(total_count == valid_count);
}
/*!
 * Append one decode step to this opcode and flag the opcode as defined.
 * Asserts that the fixed-size steps array still has room.
 */
void OpcodeInfo::step(DecodeStep& s) {
  assert(step_count < MAX_DECODE_STEPS);
  // store in the next free slot, then advance the count
  steps[step_count++] = s;
  defined = true;
}
/*!
 * Add a source operand read from the given field and interpreted with the given decode type.
 * Returns *this so calls can be chained.
 */
OpcodeInfo& OpcodeInfo::src(FieldType field, DecodeType decode) {
  DecodeStep source_step;
  source_step.decode = decode;
  source_step.field = field;
  source_step.is_src = true;
  step(source_step);
  return *this;
}
// Convenience wrappers around src() for the common register decode types.
// Each one adds a source step and returns *this for chaining.

/*! Source operand decoded as DT::GPR. */
OpcodeInfo& OpcodeInfo::src_gpr(FieldType field) {
  src(field, DT::GPR);
  return *this;
}

/*! Source operand decoded as DT::FPR. */
OpcodeInfo& OpcodeInfo::src_fpr(FieldType field) {
  src(field, DT::FPR);
  return *this;
}

/*! Source operand decoded as DT::VF. */
OpcodeInfo& OpcodeInfo::src_vf(FieldType field) {
  src(field, DT::VF);
  return *this;
}

/*! Source operand decoded as DT::VI. */
OpcodeInfo& OpcodeInfo::src_vi(FieldType field) {
  src(field, DT::VI);
  return *this;
}
/*!
 * Add a destination operand read from the given field and interpreted with the given decode type.
 * Returns *this so calls can be chained.
 */
OpcodeInfo& OpcodeInfo::dst(FieldType field, DecodeType decode) {
  DecodeStep dest_step;
  dest_step.decode = decode;
  dest_step.field = field;
  dest_step.is_src = false;
  step(dest_step);
  return *this;
}
// Convenience wrappers around dst() for the common register decode types.
// Each one adds a destination step and returns *this for chaining.

/*! Destination operand decoded as DT::GPR. */
OpcodeInfo& OpcodeInfo::dst_gpr(FieldType field) {
  dst(field, DT::GPR);
  return *this;
}

/*! Destination operand decoded as DT::FPR. */
OpcodeInfo& OpcodeInfo::dst_fpr(FieldType field) {
  dst(field, DT::FPR);
  return *this;
}

/*! Destination operand decoded as DT::VF. */
OpcodeInfo& OpcodeInfo::dst_vf(FieldType field) {
  dst(field, DT::VF);
  return *this;
}

/*! Destination operand decoded as DT::VI. */
OpcodeInfo& OpcodeInfo::dst_vi(FieldType field) {
  dst(field, DT::VI);
  return *this;
}

View File

@ -0,0 +1,351 @@
/*!
* @file OpcodeInfo.h
* Decoding info for each opcode.
*/
#ifndef NEXT_OPCODEINFO_H
#define NEXT_OPCODEINFO_H
#include <string>
// Every instruction kind the decoder knows about.
// The enumerator value is used as an index into gOpcodeInfo, which is sized by EE_OP_MAX.
// init_opcode_info() asserts that every kind except UNKNOWN has decode info defined.
enum class InstructionKind {
UNKNOWN,  // placeholder kind; the only entry expected to have no decode info
// Integer Math
ADDU, // Add Unsigned Word
ADDIU, // Add Immediate Unsigned Word
DADDU,
DADDIU, // Doubleword Add Immediate Unsigned
SUBU,
DSUBU,
MULT3, // special EE three-operand multiply
MULTU3,
DIV,
DIVU,
// Stores
SB,
SH,
SW,
SWC1,
SD,
SQ,
SQC2,  // Store Quadword from COP2
// Loads
LB,
LBU,
LH,
LHU,
LW,
LWU,
LWL,
LWR,
LWC1,
LD,
LDL,
LDR,
LQ,
LQC2,  // Load Quadword to COP2
LUI,
// Logical
AND,
ANDI,
OR,
ORI,
XOR,
XORI,
NOR,
// Moves
MOVN,
MOVZ,
MFHI,
MFLO,
MFLO1,
MTLO1,
MFPC,
MTPC,
MTC0,
MFC0,
MTDAB,
MTDABM,
MFC1,
MTC1,
QMFC2,
QMTC2,
CTC2,
CFC2,
// Jumps
JALR,
JR,
// Branch
BEQ,
BEQL,
BNE,
BNEL,
BLTZ,
BLTZL,
BGTZ,
BGTZL,
BGEZ,
BGEZL,
BLEZ,
BGEZAL,
// Shift
SLL,
SLLV,
SRL,
SRA,
DSLL,
DSLL32,
DSLLV,
DSRL,
DSRL32,
DSRLV,
DSRA,
DSRA32,
DSRAV,
// Compare
SLT,
SLTI,
SLTU,
SLTIU,
// Weird
SYSCALL,
SYNCP,
SYNCL,
ERET,
EI,
CACHE_DXWBIN,
PREF,
// MMI unsorted
PSLLW,  // Parallel Shift Left Logical Word
PSRAW,  // Parallel Shift Right Arithmetic Word
PSRAH,  // Parallel Shift Right Arithmetic Halfword
PLZCW,  // Parallel Leading Zero Count Word
PMFHL_UW,
PMFHL_LW,
PMFHL_LH,
PSLLH,  // Parallel Shift Left Logical Halfword
PSRLH,  // Parallel Shift Right Logical Halfword
// MMI 0
PEXTLW,  // Parallel Extend Lower from Word
PPACH,
PSUBW,
PCGTW,  // Parallel Compare for Greater Than Word
PEXTLH,  // Parallel Extend Lower from Halfword
PEXTLB,  // Parallel Extend Lower from Byte
PMAXH,
PPACB,
PADDW,
PADDH,
PMAXW,
PPACW,
// MMI 1
PCEQW,  // Parallel Compare for Equal Word
PEXTUW,  // Parallel Extend Upper from Word
PMINH,
PEXTUH,  // Parallel Extend Upper from Halfword
PEXTUB,  // Parallel Extend Upper from Byte
PCEQB,  // Parallel Compare for Equal Byte
PMINW,
PABSW,  // Parallel Absolute Word
// MMI 2
PCPYLD,  // Parallel Copy Lower Doubleword
PROT3W,  // Parallel Rotate 3 Word
PAND,  // Parallel And
PMADDH,  // Parallel Multiply-Add Halfword
PMULTH,  // Parallel Multiply Halfword
PEXEW,  // Parallel Exchange Even Word
// MMI 3
POR,  // Parallel Or
PCPYUD,  // Parallel Copy Upper Doubleword
PNOR,  // Parallel Not Or
PCPYH,  // Parallel Copy Halfword
PINTEH,  // Parallel Interleave Even Halfword
// COP1 / FPU
ADDS,  // Floating Point Add
SUBS,  // Floating Point Subtract
MULS,  // Floating Point Multiply
DIVS,  // Floating Point Divide
MINS,  // Floating Point Minimum
MAXS,  // Floating Point Maximum
ABSS,  // Floating Point Absolute Value
NEGS,  // Floating Point Negate
CVTSW,  // Fixed-point Convert to Single Floating Point
CVTWS,  // Floating Point Convert to Word Fixed-point
CLTS,  // Floating Point Compare (c.lt.s)
CLES,  // Floating Point Compare (c.le.s)
CEQS,  // Floating Point Compare (c.eq.s)
BC1F,  // Branch on FP False
BC1T,  // Branch on FP True
BC1FL,  // Branch on FP False Likely
BC1TL,  // Branch on FP True Likely
MULAS,  // Floating Point Multiply to Accumulator
MADDAS,  // Floating Point Multiply-Add to Accumulator
ADDAS,  // Floating Point Add to Accumulator
MSUBAS,  // Floating Point Multiply and Subtract from Accumulator
MADDS,  // Floating Point Multiply-Add
MSUBS,  // Floating Point Multiply and Subtract
MOVS,  // Floating Point Move
SQRTS,  // Floating Point Square Root
RSQRTS,  // Floating Point Reciprocal Square Root
// COP2
VMOVE,  // Transfer between Floating-Point Registers
VFTOI0,  // Conversion to Fixed Point
VFTOI4,  // Conversion to Fixed Point
VFTOI12,  // Conversion to Fixed Point
VITOF0,  // Conversion to Floating Point Number
VITOF12,  // Conversion to Floating Point Number
VITOF15,  // Conversion to Floating Point Number
VABS,  // Absolute Value
VADD,
VSUB,
VMUL,
VMINI,
VMAX,
VOPMSUB,
VMADD,
VMSUB,
// The _BC kinds share their mnemonic with the plain kind above (e.g. both VADD and VADD_BC
// are registered as "vadd") but are set up with a different operand helper.
VADD_BC,
VSUB_BC,
VMUL_BC,
VMULA_BC,
VMADD_BC,
VADDA_BC,
VMADDA_BC,
VMSUBA_BC,
VMSUB_BC,
VMINI_BC,
VMAX_BC,
VADDQ,
VSUBQ,
VMULQ,
VMSUBQ,
VMULA,
VADDA,
VMADDA,
VOPMULA,
VDIV,
VCLIP,
VMULAQ,
VMTIR,
VIAND,
VLQI,
VIADDI,
VSQI,
VRGET,
VSQRT,
VRSQRT,
VRXOR,
VRNEXT,
VNOP,
VWAITQ,
VCALLMS,
EE_OP_MAX  // number of kinds (including UNKNOWN); used as the size of gOpcodeInfo
};
// Identifies which field of an instruction a DecodeStep takes its operand from.
// NOTE(review): the actual bit positions of each field are not defined in this header;
// presumably they live in the instruction decoder - confirm there.
enum class FieldType {
RS,
RT,
RD,
SA,
FT,
FS,
FD,
SYSCALL,
SIMM16,
ZIMM16,
PCR,
DEST,
BC,
IMM5,
IMM15,
IL,
ZERO  // used by init_opcode_info together with DecodeType::VU_Q / VU_ACC operands
};
// Describes how the value of a field should be interpreted when building an operand
// (e.g. as a register of some kind, an immediate, or a branch target).
enum class DecodeType {
GPR,  // used by OpcodeInfo::src_gpr / dst_gpr
IMM,
FPR,  // used by OpcodeInfo::src_fpr / dst_fpr
COP0,
COP2,
PCR,
VF,  // used by OpcodeInfo::src_vf / dst_vf
VI,  // used by OpcodeInfo::src_vi / dst_vi
BRANCH_TARGET,
VCALLMS_TARGET,
DEST,
BC,
VU_Q,
VU_ACC,
IL
};
// A single operand of an opcode: the field it comes from, how that field is decoded,
// and whether the operand is a source or a destination.
struct DecodeStep {
bool is_src = false;  // true when added through OpcodeInfo::src, false through OpcodeInfo::dst
FieldType field;  // not default-initialized; set by OpcodeInfo::src / dst
DecodeType decode;  // not default-initialized; set by OpcodeInfo::src / dst
};
constexpr int MAX_DECODE_STEPS = 5;
/*!
 * Decoding information for a single opcode: its printed name, a few classification flags,
 * and an ordered list of up to MAX_DECODE_STEPS operand decode steps.
 *
 * Steps are added through the chainable src / dst helpers, each of which returns *this.
 */
struct OpcodeInfo {
  std::string name;
  bool is_branch = false;
  bool is_branch_likely = false;
  bool can_lo16_link = false;
  bool defined = false;  // set once at least one step has been added (see step())
  bool is_store = false;
  bool is_load = false;
  bool has_delay_slot = false;

  // Append a step; asserts that there is room left in steps.
  void step(DecodeStep& s);

  // Add a source operand.
  OpcodeInfo& src(FieldType field, DecodeType decode);
  OpcodeInfo& src_gpr(FieldType field);
  OpcodeInfo& src_fpr(FieldType field);
  OpcodeInfo& src_vf(FieldType field);
  OpcodeInfo& src_vi(FieldType field);

  // Add a destination operand.
  OpcodeInfo& dst(FieldType field, DecodeType decode);
  OpcodeInfo& dst_gpr(FieldType field);
  OpcodeInfo& dst_fpr(FieldType field);
  OpcodeInfo& dst_vf(FieldType field);
  OpcodeInfo& dst_vi(FieldType field);

  // Number of valid entries in steps. Explicitly zeroed so that an OpcodeInfo which does not
  // live in static storage also starts out empty, like every other member here.
  uint8_t step_count = 0;
  DecodeStep steps[MAX_DECODE_STEPS];
};
extern OpcodeInfo gOpcodeInfo[(uint32_t)InstructionKind::EE_OP_MAX];
void init_opcode_info();
#endif // NEXT_OPCODEINFO_H

View File

@ -0,0 +1,215 @@
/*!
* @file Register.cpp
* Representation of an EE register.
*/
#include "Register.h"
#include <cassert>
////////////////////////////
// Register Name Constants
////////////////////////////
// Printed names of the 32 general purpose registers, indexed by register number
// (same order as the Reg::Gpr enum).
const static char* gpr_names[32] = {
"r0", "at", "v0", "v1", "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
"s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", "t8", "t9", "k0", "k1", "gp", "sp", "fp", "ra"};
// Printed names of the 32 floating point registers, indexed by register number.
const static char* fpr_names[32] = {"f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7",
"f8", "f9", "f10", "f11", "f12", "f13", "f14", "f15",
"f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",
"f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31"};
// Printed names of the 32 COP0 control registers, indexed by register number
// (same order as the Reg::Cop0 enum). Entry 30 is ErrorEPC, matching Reg::ERROREPC.
const static char* cop0_names[32] = {
    "Index",     "Random",    "EntryLo0",  "EntryLo1",  "Context",   "PageMask",  "Wired",
    "INVALID7",  "BadVAddr",  "Count",     "EntryHi",   "Compare",   "Status",    "Cause",
    "EPC",       "PRId",      "Config",    "INVALID17", "INVALID18", "INVALID19", "INVALID20",
    "INVALID21", "INVALID22", "BadPAddr",  "Debug",     "Perf",      "INVALID26", "INVALID27",
    "TagLo",     "TagHi",     "ErrorEPC",  "INVALID31"};
// Printed names of the 32 VF registers, indexed by register number.
const static char* vf_names[32] = {"vf0", "vf1", "vf2", "vf3", "vf4", "vf5", "vf6", "vf7",
"vf8", "vf9", "vf10", "vf11", "vf12", "vf13", "vf14", "vf15",
"vf16", "vf17", "vf18", "vf19", "vf20", "vf21", "vf22", "vf23",
"vf24", "vf25", "vf26", "vf27", "vf28", "vf29", "vf30", "vf31"};
// Printed names of the 32 VI registers, indexed by register number.
// Entries 0-15 are vi0 - vi15; entries 16-31 carry control register names
// (same order as the Reg::Vi enum).
const static char* vi_names[32] = {
"vi0", "vi1", "vi2", "vi3", "vi4", "vi5", "vi6", "vi7",
"vi8", "vi9", "vi10", "vi11", "vi12", "vi13", "vi14", "vi15",
"Status", "MAC", "Clipping", "INVALID3", "R", "I", "Q", "INVALID7",
"INVALID8", "INVALID9", "TPC", "CMSAR0", "FBRST", "VPU-STAT", "INVALID14", "CMSAR1"};
// Printed names of the two performance counter registers.
const static char* pcr_names[2] = {"pcr0", "pcr1"};
/////////////////////////////
// Register Names Conversion
/////////////////////////////
namespace {
// File-local lookups from a register number to its printed name.
// Each asserts that the number is within the bounds of the matching name table.

/*! Name of a general purpose register. */
const char* gpr_to_charp(Reg::Gpr gpr) {
  assert(gpr < 32);
  const char* name = gpr_names[gpr];
  return name;
}

/*! Name of a floating point register. */
const char* fpr_to_charp(uint32_t fpr) {
  assert(fpr < 32);
  const char* name = fpr_names[fpr];
  return name;
}

/*! Name of a COP0 control register. */
const char* cop0_to_charp(Reg::Cop0 cpr) {
  assert(cpr < 32);
  const char* name = cop0_names[cpr];
  return name;
}

/*! Name of a VF register. */
const char* vf_to_charp(uint32_t vf) {
  assert(vf < 32);
  const char* name = vf_names[vf];
  return name;
}

/*! Name of a VI register. */
const char* vi_to_charp(uint32_t vi) {
  assert(vi < 32);
  const char* name = vi_names[vi];
  return name;
}

/*! Name of a performance counter register. */
const char* pcr_to_charp(uint32_t pcr) {
  assert(pcr < 2);
  const char* name = pcr_names[pcr];
  return name;
}
}  // namespace
/////////////////////////////
// Register Class
/////////////////////////////
// A register is stored as a 16-bit integer, with the top 8 bits indicating the "kind" and the lower
// 8 bits representing the register id within that kind. If the integer is -1, it is a special
// "invalid" register used to represent an uninitialized Register.
// Note: VI / COP2 are separate "kinds" of registers, each with 16 registers.
// It might make sense to make this a single "kind" instead?
/*!
* Create a register. The kind and num must both be valid.
*/
Register::Register(Reg::RegisterKind kind, uint32_t num) {
  // How many registers exist for the requested kind; stays 0 for an unrecognized kind.
  uint32_t count_for_kind = 0;
  switch (kind) {
    case Reg::PCR:
      count_for_kind = 2;
      break;
    case Reg::GPR:
    case Reg::FPR:
    case Reg::VF:
    case Reg::COP0:
    case Reg::VI:
      count_for_kind = 32;
      break;
    default:
      break;
  }

  // the kind must be a known one, and the number must be in range for it
  assert(count_for_kind != 0);
  assert(num < count_for_kind);

  // kind goes in the upper 8 bits, register number in the lower 8 bits
  id = (kind << 8) | num;
}
/*!
* Convert to string. The register must be valid.
*/
const char* Register::to_charp() const {
  // dispatch on the register kind to the matching name table lookup
  switch (get_kind()) {
    case Reg::GPR:
      return gpr_to_charp(get_gpr());
    case Reg::FPR:
      return fpr_to_charp(get_fpr());
    case Reg::VF:
      return vf_to_charp(get_vf());
    case Reg::VI:
      return vi_to_charp(get_vi());
    case Reg::COP0:
      return cop0_to_charp(get_cop0());
    case Reg::PCR:
      return pcr_to_charp(get_pcr());
    default:
      // Unknown kind: a bug in debug builds. When asserts are compiled out we must still
      // return something, otherwise control would run off the end of a non-void function.
      assert(false);
      return "INVALID";
  }
}
/*!
* Convert to string. The register must be valid.
*/
std::string Register::to_string() const {
  // same text as to_charp(), copied into an owning string
  return std::string(to_charp());
}
/*!
* Get the register kind.
*/
Reg::RegisterKind Register::get_kind() const {
  // the kind lives in the upper 8 bits of the id
  const uint16_t kind_bits = id >> 8;
  assert(kind_bits < Reg::MAX_KIND);
  return static_cast<Reg::RegisterKind>(kind_bits);
}
/*!
* Get the GPR number. Must be a GPR.
*/
Reg::Gpr Register::get_gpr() const {
  assert(get_kind() == Reg::GPR);
  // the register number lives in the lower 8 bits of the id
  const uint16_t reg_num = id & 0xff;
  assert(reg_num < Reg::MAX_GPR);
  return static_cast<Reg::Gpr>(reg_num);
}
/*!
* Get the FPR number. Must be an FPR.
*/
uint32_t Register::get_fpr() const {
assert(get_kind() == Reg::FPR);
uint16_t kind = id & 0xff;
assert(kind < 32);
return kind;
}
/*!
* Get the VF number. Must be a VF.
*/
uint32_t Register::get_vf() const {
assert(get_kind() == Reg::VF);
uint16_t kind = id & 0xff;
assert(kind < 32);
return kind;
}
/*!
* Get the VI number. Must be a VI.
*/
uint32_t Register::get_vi() const {
assert(get_kind() == Reg::VI);
uint16_t kind = id & 0xff;
assert(kind < 32);
return kind;
}
/*!
* Get the COP0 number. Must be a COP0.
*/
Reg::Cop0 Register::get_cop0() const {
  assert(get_kind() == Reg::COP0);
  // the register number lives in the lower 8 bits of the id
  const uint16_t reg_num = id & 0xff;
  assert(reg_num < Reg::MAX_COP0);
  return static_cast<Reg::Cop0>(reg_num);
}
/*!
* Get the PCR number. Must be a PCR.
*/
uint32_t Register::get_pcr() const {
assert(get_kind() == Reg::PCR);
uint16_t kind = id & 0xff;
assert(kind < 2);
return kind;
}
/*!
 * Two registers are equal when their packed ids (kind and number) are equal.
 */
bool Register::operator==(const Register& other) const {
  return other.id == id;
}

/*!
 * Negation of operator==.
 */
bool Register::operator!=(const Register& other) const {
  return !(other.id == id);
}

View File

@ -0,0 +1,145 @@
/*!
* @file Register.h
* Representation of an EE register.
*/
#ifndef NEXT_REGISTER_H
#define NEXT_REGISTER_H
#include <cstdint>
#include <string>
// Namespace for register name constants
namespace Reg {
// The kind of a register. Stored in the upper 8 bits of Register's id.
enum RegisterKind {
GPR = 0, // EE General purpose registers, these have nicknames.
FPR = 1, // EE Floating point registers, just called f0 - f31
VF = 2, // VU0 Floating point vector registers from EE, just called vf0 - vf31
VI =
3, // VU0 Integer registers from EE, the first 16 are vi0 - vi15, the rest are control regs.
COP0 = 4, // EE COP0 Control Registers: full of fancy names (there are 32 of them)
PCR = 5, // Performance Counter registers (PCR0, PCR1)
MAX_KIND = 6  // number of kinds, used for range checks
};
// nicknames for GPRs
enum Gpr {
R0 = 0, // hardcoded to zero
AT = 1, // temp, not used by GOAL compiler, but used by GOAL's kernel inline assembly (and other
// places?)
V0 = 2, // return, temp
V1 = 3, // temp
A0 = 4, // arg0, temp
A1 = 5, // arg1, temp
A2 = 6, // arg2, temp
A3 = 7, // arg3, temp
T0 = 8, // arg4, temp
T1 = 9, // arg5, temp
T2 = 10, // arg6, temp
T3 = 11, // arg7, temp
T4 = 12, // temp
T5 = 13, // temp
T6 = 14, // temp
T7 = 15, // temp
S0 = 16, // saved
S1 = 17, // saved
S2 = 18, // saved
S3 = 19, // saved
S4 = 20, // saved
S5 = 21, // saved
S6 = 22, // process pointer
S7 = 23, // symbol table
T8 = 24, // temp
T9 = 25, // function pointer
K0 = 26, // reserved
K1 = 27, // reserved
GP = 28, // saved (C code uses this as a global pointer)
SP = 29, // stack pointer
FP = 30, // global pointer (address of current function)
RA = 31, // return address
MAX_GPR = 32  // number of GPRs, used for range checks
};
// nicknames for COP0 registers
enum Cop0 {
INDEX = 0,
RANDOM = 1,
ENTRYLO0 = 2,
ENTRYLO1 = 3,
CONTEXT = 4,
PAGEMASK = 5,
WIRED = 6,
INVALID7 = 7,
BADVADDR = 8,
COUNT = 9,
ENTRYHI = 10,
COMPARE = 11,
COP0_STATUS = 12,
CAUSE = 13,
EPC = 14,
PRID = 15,
CONFIG = 16,
INVALID17 = 17,
INVALID18 = 18,
INVALID19 = 19,
INVALID20 = 20,
INVALID21 = 21,
INVALID22 = 22,
BADPADDR = 23,
DEBUG = 24,
PERF = 25,
INVALID26 = 26,
INVALID27 = 27,
TAGLO = 28,
TAGHI = 29,
ERROREPC = 30,
INVALID31 = 31,
MAX_COP0 = 32  // number of COP0 registers, used for range checks
};
// nicknames for COP2 Integer (VI) registers
// the first 16 are vi0 - vi15, so they don't have nicknames
enum Vi {
COP2_STATUS = 16,
MAC = 17,
CLIPPING = 18,
COP2_INVALID3 = 19,
R = 20,
I = 21,
Q = 22,
COP2_INVALID7 = 23,
COP2_INVALID8 = 24,
COP2_INVALID9 = 25,
TPC = 26,
CMSAR0 = 27,
FBRST = 28,
VPUSTAT = 29,
COP2_INVALID14 = 30,
CMSAR1 = 31,
MAX_COP2 = 32  // one past the last VI register number
};
} // namespace Reg
// Representation of a register. Uses a 16-bit integer internally: the register kind is stored
// in the upper 8 bits and the register number within that kind in the lower 8 bits.
// A default-constructed Register holds the all-ones id and is not a valid register.
class Register {
public:
// Create an invalid (uninitialized) register.
Register() = default;
// Create a register of the given kind and number.
Register(Reg::RegisterKind kind, uint32_t num);
// Printed name of the register, as a C string or as a std::string.
const char* to_charp() const;
std::string to_string() const;
// Kind of this register.
Reg::RegisterKind get_kind() const;
// Register number accessors. Each one asserts that the register is of the matching kind.
Reg::Gpr get_gpr() const;
uint32_t get_fpr() const;
uint32_t get_vf() const;
uint32_t get_vi() const;
Reg::Cop0 get_cop0() const;
uint32_t get_pcr() const;
// Registers compare equal when both kind and number match.
bool operator==(const Register& other) const;
bool operator!=(const Register& other) const;
private:
// packed (kind << 8) | number; -1 converts to 0xffff, the "invalid" marker
uint16_t id = -1;
};
#endif // NEXT_REGISTER_H

View File

@ -0,0 +1,51 @@
#include <algorithm>
#include <cassert>
#include "BasicBlocks.h"
#include "decompiler/ObjectFile/LinkedObjectFile.h"
#include "decompiler/Disasm/InstructionMatching.h"
/*!
* Find all basic blocks in a function.
* All delay slot instructions are grouped with the branch instruction.
* This is done by finding all "dividers", which are after branch delay instructions and before
* branch destinations, then sorting them, ignoring duplicates, and creating the blocks.
*/
std::vector<BasicBlock> find_blocks_in_function(const LinkedObjectFile& file,
                                                int seg,
                                                const Function& func) {
  const int instr_count = int(func.instructions.size());

  // Block boundaries, as instruction indices relative to the start of the function.
  // The function always begins a block at index 0 (so the leading "function" type word lands in
  // the first block) and ends one at the instruction count.
  std::vector<int> dividers = {0, instr_count};

  for (int idx = 0; idx < instr_count; idx++) {
    const auto& instr = func.instructions.at(idx);
    const auto& info = instr.get_info();
    if (!info.is_branch && !info.is_branch_likely) {
      continue;
    }

    // the delay slot of this branch must still be inside the function
    assert(idx + func.start_word < func.end_word - 1);

    // a block ends after the delay slot...
    dividers.push_back(idx + 2);

    // ...and another one begins at the branch destination
    auto label_id = instr.get_label_target();
    assert(label_id != -1);
    const auto& label = file.labels.at(label_id);

    // branches are expected to stay within our own function
    assert(label.target_segment == seg);
    const auto target_word = label.offset / 4;
    assert(target_word > func.start_word);
    assert(target_word < func.end_word - 1);
    dividers.push_back(target_word - func.start_word);
  }

  std::sort(dividers.begin(), dividers.end());

  // every pair of distinct neighbouring dividers delimits one block
  std::vector<BasicBlock> basic_blocks;
  for (size_t k = 1; k < dividers.size(); k++) {
    const int block_begin = dividers[k - 1];
    const int block_end = dividers[k];
    if (block_begin == block_end) {
      continue;  // duplicate divider
    }
    basic_blocks.emplace_back(block_begin, block_end);
    assert(block_begin >= 0);
  }

  return basic_blocks;
}

View File

@ -0,0 +1,23 @@
#ifndef JAK_DISASSEMBLER_BASICBLOCKS_H
#define JAK_DISASSEMBLER_BASICBLOCKS_H
#include <vector>
#include <memory>
#include "CfgVtx.h"
class LinkedObjectFile;
class Function;
/*!
 * A basic block, described by the pair of boundaries handed to the constructor.
 * find_blocks_in_function passes two consecutive dividers, i.e. indices relative to the start
 * of the function, with end_word being the start of the following block.
 */
struct BasicBlock {
  int start_word;
  int end_word;

  BasicBlock(int first, int last) : start_word{first}, end_word{last} {}
};
std::vector<BasicBlock> find_blocks_in_function(const LinkedObjectFile& file,
int seg,
const Function& func);
#endif // JAK_DISASSEMBLER_BASICBLOCKS_H

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,336 @@
#ifndef JAK_DISASSEMBLER_CFGVTX_H
#define JAK_DISASSEMBLER_CFGVTX_H
#include <string>
#include <vector>
#include <cassert>
#include "decompiler/util/LispPrint.h"
/*!
* In v, find an item equal to old, and replace it with replace.
* Asserts if there is not exactly one thing equal to old.
*/
template <typename T>
void replace_exactly_one_in(std::vector<T>& v, T old, T replace) {
  // number of elements replaced so far; must end up being exactly one
  int hits = 0;
  for (size_t i = 0; i < v.size(); i++) {
    if (!(v[i] == old)) {
      continue;
    }
    // a second match means the element was not unique
    assert(hits == 0);
    v[i] = replace;
    hits++;
  }
  // no match at all is an error too
  assert(hits != 0);
}
/*!
* Representation of a vertex in the control flow graph.
*
* The desired state of the control flow graph is to have a single "top-level" node, with NULL as
* its parent. This top level node can then be viewed as the entire control flow for the function.
* When the graph is fully understood, the only relation between vertices should be parent-child.
* For example, an "if_else" vertex will have a "condition" vertex, "true_case" vertex, and "false
* case" vertex as children.
*
* However, the initial state of the CFG is to have all the vertices be in the top level. When there
* are multiple top level vertices, the graph is considered to be "unresolved", as there are
* relations between these that are not explained by parent-child control structuring. These
* relations are either pred/succ, indicating program control flow, and next/prev indicating code
* layout order. These are undesirable because these do not map to high-level program structure.
*
* The graph attempts to "resolve" itself, meaning these pred/succ relations are destroyed and
* replaced with nested control flow. The pred/succ and next/prev relations should only exist at the
* top level.
*
* Once resolved, there will be a single "top level" node containing the entire control flow
* structure.
*
* All CfgVtxs should be created from the ControlFlowGraph::alloc function, which allocates them
* from a pool and cleans them up when the ControlFlowGraph is destroyed. This approach avoids
* circular reference issues from a reference counting approach, but does mean that temporary
* allocations aren't cleaned up until the entire graph is deleted, but this is probably fine.
*
* Note - there are two special "top-level" vertices that are always present, called Entry and Exit.
* These always exist and don't count toward making the graph unresolved.
* These vertices won't be counted in the get_top_level_vertices_count.
*
* Desired end state of the graph:
* Entry -> some-top-level-control-flow-structure -> Exit
*/
class CfgVtx {
 public:
  virtual std::string to_string() = 0;          // single-line description, for debugging
  virtual std::shared_ptr<Form> to_form() = 0;  // recursive print as a LISP form
  virtual ~CfgVtx() = default;

  CfgVtx* parent = nullptr;       // enclosing structure, nullptr when this vertex is top level
  CfgVtx* succ_branch = nullptr;  // successor reached by branching, nullptr if there is no branch
  CfgVtx* succ_ft = nullptr;      // successor reached by falling through, nullptr if impossible
  CfgVtx* next = nullptr;         // vertex that follows in memory
  CfgVtx* prev = nullptr;         // vertex that precedes in memory
  std::vector<CfgVtx*> pred;      // every vertex that has us as succ_branch or succ_ft
  int uid = -1;

  // properties of the branch (if any) that ends this vertex
  struct {
    bool has_branch = false;     // does the block end in a branch (any kind)?
    bool branch_likely = false;  // does the block end in a likely branch?
    bool branch_always = false;  // does the branch always get taken?
  } end_branch;

  // children are declared by the individual subclasses.

  /*!
   * Is s one of our successors (via branch or fall through)?
   */
  bool has_succ(CfgVtx* s) const {
    if (succ_branch == s) {
      return true;
    }
    return succ_ft == s;
  }

  /*!
   * Is p listed among our predecessors?
   */
  bool has_pred(CfgVtx* p) const {
    for (size_t i = 0; i < pred.size(); i++) {
      if (pred[i] == p) {
        return true;
      }
    }
    return false;
  }

  /*!
   * Collect the non-null successors: the branch successor first, then the fall-through
   * successor unless it is the same vertex.
   */
  std::vector<CfgVtx*> succs() {
    std::vector<CfgVtx*> result;
    if (succ_branch != nullptr) {
      result.push_back(succ_branch);
    }
    const bool distinct_fall_through = succ_ft != nullptr && succ_ft != succ_branch;
    if (distinct_fall_through) {
      result.push_back(succ_ft);
    }
    return result;
  }

  void parent_claim(CfgVtx* new_parent);
  void replace_pred_and_check(CfgVtx* old_pred, CfgVtx* new_pred);
  void replace_succ_and_check(CfgVtx* old_succ, CfgVtx* new_succ);
  void replace_preds_with_and_check(std::vector<CfgVtx*> old_preds, CfgVtx* new_pred);

  std::string links_to_string();
};
/*!
* Special Entry vertex representing the beginning of the function
*/
class EntryVtx : public CfgVtx {
public:
EntryVtx() = default;
// Printing interface inherited from CfgVtx; this vertex adds no data of its own.
std::shared_ptr<Form> to_form() override;
std::string to_string() override;
};
/*!
* Special Exit vertex representing the end of the function
*/
class ExitVtx : public CfgVtx {
public:
// Printing interface inherited from CfgVtx; this vertex adds no data of its own.
std::string to_string() override;
std::shared_ptr<Form> to_form() override;
};
/*!
* A vertex which represents a single basic block. It has no children.
*/
class BlockVtx : public CfgVtx {
public:
// Create the vertex for the basic block with the given id.
explicit BlockVtx(int id) : block_id(id) {}
std::string to_string() override;
std::shared_ptr<Form> to_form() override;
int block_id = -1; // which block are we?
bool is_early_exit_block = false; // are we an empty block at the end for early exits to jump to?
};
/*!
* A vertex representing a sequence of child vertices which are always represented in order.
* Child vertices in here don't set their next/prev pred/succ pointers as this counts as resolved.
*/
class SequenceVtx : public CfgVtx {
public:
std::string to_string() override;
std::shared_ptr<Form> to_form() override;
// the child vertices, in order
std::vector<CfgVtx*> seq;
};
/*!
* Representing a (cond ((a b) (c d) ... (else z))) structure.
* Note that the first condition ("a" in the above example) may "steal" instructions belonging
* to an outer scope and these may eventually need to be "unstolen"
*/
class CondWithElse : public CfgVtx {
public:
std::string to_string() override;
std::shared_ptr<Form> to_form() override;
// One (condition body) pair of the cond.
struct Entry {
Entry() = default;
Entry(CfgVtx* _c, CfgVtx* _b) : condition(_c), body(_b) {}
CfgVtx* condition = nullptr;
CfgVtx* body = nullptr;
};
// the (condition body) pairs, in order
std::vector<Entry> entries;
// the vertex for the else case
CfgVtx* else_vtx = nullptr;
};
/*!
* Representing a (cond ((a b) (c d) ... )) structure.
* Note that the first condition ("a" in the above example) may "steal" instructions belonging
* to an outer scope and these may eventually need to be "unstolen"
*/
class CondNoElse : public CfgVtx {
public:
std::string to_string() override;
std::shared_ptr<Form> to_form() override;
// One (condition body) pair of the cond.
struct Entry {
Entry() = default;
Entry(CfgVtx* _c, CfgVtx* _b) : condition(_c), body(_b) {}
CfgVtx* condition = nullptr;
CfgVtx* body = nullptr;
};
// the (condition body) pairs, in order; there is no else case
std::vector<Entry> entries;
};
// A vertex with a condition child and a body child.
// NOTE(review): presumably models a loop whose condition is checked before the body - confirm
// against the matching code in CfgVtx.cpp.
class WhileLoop : public CfgVtx {
public:
std::string to_string() override;
std::shared_ptr<Form> to_form() override;
CfgVtx* condition = nullptr;
CfgVtx* body = nullptr;
};
// A vertex with a condition child and a body child.
// NOTE(review): presumably models a loop that repeats the body until the condition holds -
// confirm against the matching code in CfgVtx.cpp.
class UntilLoop : public CfgVtx {
public:
std::string to_string() override;
std::shared_ptr<Form> to_form() override;
CfgVtx* condition = nullptr;
CfgVtx* body = nullptr;
};
// A vertex with a single child, block.
// NOTE(review): presumably an until loop where condition and body are the same vertex -
// confirm against the matching code in CfgVtx.cpp.
class UntilLoop_single : public CfgVtx {
public:
std::string to_string() override;
std::shared_ptr<Form> to_form() override;
CfgVtx* block = nullptr;
};
// A vertex holding an ordered list of child vertices in entries.
// NOTE(review): presumably models short-circuiting boolean evaluation - confirm against the
// matching code in CfgVtx.cpp.
class ShortCircuit : public CfgVtx {
public:
std::string to_string() override;
std::shared_ptr<Form> to_form() override;
std::vector<CfgVtx*> entries;
};
// A vertex with a single child, block.
// NOTE(review): presumably models a loop without an exit condition - confirm against the
// matching code in CfgVtx.cpp.
class InfiniteLoopBlock : public CfgVtx {
 public:
  std::string to_string() override;
  std::shared_ptr<Form> to_form() override;
  // initialized to nullptr like the child pointers of every other vertex type, so a freshly
  // allocated vertex never carries an indeterminate pointer
  CfgVtx* block = nullptr;
};
// A vertex with two children: body and unreachable_block.
// NOTE(review): presumably a body that jumps to the end of the function, followed by code that
// can no longer be reached - confirm against the matching code in CfgVtx.cpp.
class GotoEnd : public CfgVtx {
public:
std::string to_string() override;
std::shared_ptr<Form> to_form() override;
CfgVtx* body = nullptr;
CfgVtx* unreachable_block = nullptr;
};
struct BasicBlock;
/*!
* The actual CFG class, which owns all the vertices.
*/
class ControlFlowGraph {
 public:
  ControlFlowGraph();
  ~ControlFlowGraph();

  // The graph owns every vertex in its node pool through raw pointers and releases them when
  // it is destroyed. A copy would share those pointers and free them a second time, so copying
  // is disabled.
  ControlFlowGraph(const ControlFlowGraph&) = delete;
  ControlFlowGraph& operator=(const ControlFlowGraph&) = delete;

  // printing / debugging
  std::shared_ptr<Form> to_form();
  std::string to_form_string();
  std::string to_dot();

  // state of the resolution process
  int get_top_level_vertices_count();
  bool is_fully_resolved();
  CfgVtx* get_single_top_level();

  // initial construction from basic blocks
  void flag_early_exit(const std::vector<BasicBlock>& blocks);
  const std::vector<BlockVtx*>& create_blocks(int count);
  void link_fall_through(BlockVtx* first, BlockVtx* second);
  void link_branch(BlockVtx* first, BlockVtx* second);

  // structure finding passes; each returns a bool
  // NOTE(review): presumably true when the pass changed the graph - confirm in CfgVtx.cpp
  bool find_cond_w_else();
  bool find_cond_n_else();
  //  bool find_if_else_top_level();
  bool find_seq_top_level();
  bool find_while_loop_top_level();
  bool find_until_loop();
  bool find_until1_loop();
  bool find_short_circuits();
  bool find_goto_end();
  bool find_infinite_loop();

  /*!
   * Apply a function f to each top-level vertex (a vertex without a parent that is neither the
   * entry nor the exit vertex).
   * If f returns false, stops.
   */
  template <typename Func>
  void for_each_top_level_vtx(Func f) {
    for (auto* x : m_node_pool) {
      if (!x->parent && x != entry() && x != exit()) {
        if (!f(x)) {
          return;
        }
      }
    }
  }

  EntryVtx* entry() { return m_entry; }
  ExitVtx* exit() { return m_exit; }

  /*!
   * Allocate and construct a node of the specified type.
   * The node is added to the pool owned by this graph and receives the next unique id.
   */
  template <typename T, class... Args>
  T* alloc(Args&&... args) {
    T* new_obj = new T(std::forward<Args>(args)...);
    m_node_pool.push_back(new_obj);
    new_obj->uid = m_uid++;
    return new_obj;
  }

 private:
  //  bool compact_one_in_top_level();
  //  bool is_if_else(CfgVtx* b0, CfgVtx* b1, CfgVtx* b2, CfgVtx* b3);
  bool is_sequence(CfgVtx* b0, CfgVtx* b1);
  bool is_sequence_of_non_sequences(CfgVtx* b0, CfgVtx* b1);
  bool is_sequence_of_sequence_and_non_sequence(CfgVtx* b0, CfgVtx* b1);
  bool is_sequence_of_sequence_and_sequence(CfgVtx* b0, CfgVtx* b1);
  bool is_sequence_of_non_sequence_and_sequence(CfgVtx* b0, CfgVtx* b1);
  bool is_while_loop(CfgVtx* b0, CfgVtx* b1, CfgVtx* b2);
  bool is_until_loop(CfgVtx* b1, CfgVtx* b2);
  bool is_goto_end_and_unreachable(CfgVtx* b0, CfgVtx* b1);

  std::vector<BlockVtx*> m_blocks;   // all block nodes, in order.
  std::vector<CfgVtx*> m_node_pool;  // all nodes allocated
  EntryVtx* m_entry = nullptr;       // the entry vertex
  ExitVtx* m_exit = nullptr;         // the exit vertex
  int m_uid = 0;                     // uid handed to the next allocated node
};
class LinkedObjectFile;
class Function;
std::shared_ptr<ControlFlowGraph> build_cfg(const LinkedObjectFile& file, int seg, Function& func);
#endif // JAK_DISASSEMBLER_CFGVTX_H

View File

@ -0,0 +1,552 @@
#include <cassert>
#include <vector>
#include "Function.h"
#include "decompiler/Disasm/InstructionMatching.h"
#include "decompiler/ObjectFile/LinkedObjectFile.h"
#include "decompiler/TypeSystem/TypeInfo.h"
namespace {
// Registers that may be backed up, in the order consumed by get_expected_gpr_backup:
// when `total` registers are backed up, the first `total` entries of this list are used,
// looked up back to front.
std::vector<Register> gpr_backups = {make_gpr(Reg::GP), make_gpr(Reg::S5), make_gpr(Reg::S4),
make_gpr(Reg::S3), make_gpr(Reg::S2), make_gpr(Reg::S1),
make_gpr(Reg::S0)};
// Same idea for floating point registers, consumed by get_expected_fpr_backup.
std::vector<Register> fpr_backups = {make_fpr(30), make_fpr(28), make_fpr(26),
make_fpr(24), make_fpr(22), make_fpr(20)};
/*!
 * Get the n-th GPR expected to be backed up when `total` GPRs are backed up in all.
 * The first `total` entries of gpr_backups are used, read back to front.
 */
Register get_expected_gpr_backup(int n, int total) {
  const int table_size = int(gpr_backups.size());
  assert(total <= table_size);
  assert(n < total);
  const int reversed_index = total - 1 - n;
  return gpr_backups.at(reversed_index);
}
/*!
 * Get the n-th FPR expected to be backed up when `total` FPRs are backed up in all.
 * The first `total` entries of fpr_backups are used, read back to front.
 */
Register get_expected_fpr_backup(int n, int total) {
  const int table_size = int(fpr_backups.size());
  assert(total <= table_size);
  assert(n < total);
  const int reversed_index = total - 1 - n;
  return fpr_backups.at(reversed_index);
}
/*!
 * Round in up to the next multiple of 16 (unsigned arithmetic, wraps on overflow).
 */
uint32_t align16(uint32_t in) {
  const uint32_t low_bits = 15;
  const uint32_t bumped = in + low_bits;
  return bumped & ~low_bits;
}
/*!
 * Round in up to the next multiple of 8 (unsigned arithmetic, wraps on overflow).
 */
uint32_t align8(uint32_t in) {
  const uint32_t low_bits = 7;
  const uint32_t bumped = in + low_bits;
  return bumped & ~low_bits;
}
/*!
 * Round in up to the next multiple of 4 (unsigned arithmetic, wraps on overflow).
 */
uint32_t align4(uint32_t in) {
  const uint32_t low_bits = 3;
  const uint32_t bumped = in + low_bits;
  return bumped & ~low_bits;
}
} // namespace
/*!
 * Create a function spanning the given word range. Only records the two boundaries.
 */
Function::Function(int first_word, int last_word)
    : start_word(first_word), end_word(last_word) {}
/*!
 * Remove the function prologue from the first basic block and populate this->prologue with info.
 *
 * Decoding starts at instruction 1 (instruction 0 is the type tag) and walks, in order: the stack
 * pointer adjustment, the ra backup, the fp backup (and fp set), the GPR backups and the FPR
 * backups. The stack layout implied by those stores is then re-derived and asserted against the
 * total stack usage.
 *
 * If a store pattern associated with hand-written assembly is found, suspected_asm is set, a
 * warning is recorded and the function returns early: prologue.decoded stays false and the basic
 * blocks are left untouched.
 *
 * On success the first basic block is trimmed to start at prologue_end, prologue.decoded is set and
 * check_epilogue() is run.
 */
void Function::analyze_prologue(const LinkedObjectFile& file) {
// instruction 0 is the type tag, so the prologue starts at instruction 1.
int idx = 1;
// first we look for daddiu sp, sp, -x to determine how much stack is used
if (is_gpr_2_imm_int(instructions.at(idx), InstructionKind::DADDIU, make_gpr(Reg::SP),
make_gpr(Reg::SP), {})) {
prologue.total_stack_usage = -instructions.at(idx).get_imm_src_int();
idx++;
} else {
prologue.total_stack_usage = 0;
}
// don't include type tag
prologue_end = 1;
// if we use the stack, we may back up some registers onto it
if (prologue.total_stack_usage) {
// heuristics to detect asm functions
{
auto& instr = instructions.at(idx);
// storing stack pointer on the stack is done by some ASM kernel functions
if (instr.kind == InstructionKind::SW && instr.get_src(0).get_reg() == make_gpr(Reg::SP)) {
printf("[Warning] %s Suspected ASM function based on this instruction in prologue: %s\n",
guessed_name.to_string().c_str(), instr.to_string(file).c_str());
warnings += "Flagged as ASM function because of " + instr.to_string(file) + "\n";
suspected_asm = true;
return;
}
}
// ra backup is always first
if (is_no_link_gpr_store(instructions.at(idx), 8, Register(Reg::GPR, Reg::RA), {},
Register(Reg::GPR, Reg::SP))) {
prologue.ra_backed_up = true;
prologue.ra_backup_offset = get_gpr_store_offset_as_int(instructions.at(idx));
assert(prologue.ra_backup_offset == 0);
idx++;
}
{
auto& instr = instructions.at(idx);
// storing s7 on the stack is done by interrupt handlers, which we probably don't want to
// support
if (instr.kind == InstructionKind::SD && instr.get_src(0).get_reg() == make_gpr(Reg::S7)) {
printf("[Warning] %s Suspected ASM function based on this instruction in prologue: %s\n",
guessed_name.to_string().c_str(), instr.to_string(file).c_str());
warnings += "Flagged as ASM function because of " + instr.to_string(file) + "\n";
suspected_asm = true;
return;
}
}
// next is fp backup
if (is_no_link_gpr_store(instructions.at(idx), 8, Register(Reg::GPR, Reg::FP), {},
Register(Reg::GPR, Reg::SP))) {
prologue.fp_backed_up = true;
prologue.fp_backup_offset = get_gpr_store_offset_as_int(instructions.at(idx));
// in Jak 1 it looks like we never back up fp unless ra is also backed up, so the offset is
// always 8. but it seems like it could be possible to do one without the other?
assert(prologue.fp_backup_offset == 8);
idx++;
// after backing up fp, we always set it to t9.
prologue.fp_set = is_gpr_3(instructions.at(idx), InstructionKind::OR, make_gpr(Reg::FP),
make_gpr(Reg::T9), make_gpr(Reg::R0));
assert(prologue.fp_set);
idx++;
}
// next is gpr backups. these are in reverse order, so we should first find the length
// GOAL will always do the exact same thing when the same number of gprs needs to be backed up
// so we just need to determine the number of GPR backups, and we have all the info we need
int n_gpr_backups = 0;
// gpr_idx scans ahead to count the stores; idx stays at the first GPR store.
int gpr_idx = idx;
bool expect_nothing_after_gprs = false;
while (is_no_link_gpr_store(instructions.at(gpr_idx), 16, {}, {}, make_gpr(Reg::SP))) {
auto store_reg = instructions.at(gpr_idx).get_src(0).get_reg();
// sometimes stack memory is zeroed immediately after gpr backups, and this fools the previous
// check.
if (store_reg == make_gpr(Reg::R0)) {
printf(
"[Warning] %s Stack Zeroing Detected in Function::analyze_prologue, prologue may be "
"wrong\n",
guessed_name.to_string().c_str());
warnings += "Stack Zeroing Detected, prologue may be wrong\n";
expect_nothing_after_gprs = true;
break;
}
// this also happens a few times per game. this a0/r0 check seems to be all that's needed to
// avoid false positives here!
if (store_reg == make_gpr(Reg::A0)) {
suspected_asm = true;
printf(
"[Warning] %s Suspected ASM function because register $a0 was stored on the stack!\n",
guessed_name.to_string().c_str());
warnings += "a0 on stack detected, flagging as asm\n";
return;
}
n_gpr_backups++;
gpr_idx++;
}
if (n_gpr_backups) {
// the first GPR store gives the base offset; each following store must be 16 bytes further and
// must save the register that the backup table predicts for this count.
prologue.gpr_backup_offset = get_gpr_store_offset_as_int(instructions.at(idx));
for (int i = 0; i < n_gpr_backups; i++) {
int this_offset = get_gpr_store_offset_as_int(instructions.at(idx + i));
auto this_reg = instructions.at(idx + i).get_src(0).get_reg();
assert(this_offset == prologue.gpr_backup_offset + 16 * i);
if (this_reg != get_expected_gpr_backup(i, n_gpr_backups)) {
suspected_asm = true;
printf("[Warning] %s Suspected asm function that isn't flagged due to stack store %s\n",
guessed_name.to_string().c_str(),
instructions.at(idx + i).to_string(file).c_str());
warnings += "Suspected asm function due to stack store: " +
instructions.at(idx + i).to_string(file) + "\n";
return;
}
}
}
prologue.n_gpr_backup = n_gpr_backups;
idx = gpr_idx;
int n_fpr_backups = 0;
int fpr_idx = idx;
// FPR backups are only looked for when no stack zeroing was detected after the GPR backups.
if (!expect_nothing_after_gprs) {
// FPR backups
while (is_no_ll_fpr_store(instructions.at(fpr_idx), {}, {}, make_gpr(Reg::SP))) {
// auto store_reg = instructions.at(gpr_idx).get_src(0).get_reg();
n_fpr_backups++;
fpr_idx++;
}
if (n_fpr_backups) {
// same check as for GPRs, but FPR slots are 4 bytes apart.
prologue.fpr_backup_offset = instructions.at(idx).get_src(1).get_imm();
for (int i = 0; i < n_fpr_backups; i++) {
int this_offset = instructions.at(idx + i).get_src(1).get_imm();
auto this_reg = instructions.at(idx + i).get_src(0).get_reg();
assert(this_offset == prologue.fpr_backup_offset + 4 * i);
if (this_reg != get_expected_fpr_backup(i, n_fpr_backups)) {
suspected_asm = true;
printf("[Warning] %s Suspected asm function that isn't flagged due to stack store %s\n",
guessed_name.to_string().c_str(),
instructions.at(idx + i).to_string(file).c_str());
warnings += "Suspected asm function due to stack store: " +
instructions.at(idx + i).to_string(file) + "\n";
return;
}
}
}
}
prologue.n_fpr_backup = n_fpr_backups;
idx = fpr_idx;
prologue_start = 1;
prologue_end = idx;
// stack variables start after the ra/fp backup slots (if any).
prologue.stack_var_offset = 0;
if (prologue.ra_backed_up) {
prologue.stack_var_offset = 8;
}
if (prologue.fp_backed_up) {
prologue.stack_var_offset = 16;
}
// stack variables run from stack_var_offset up to the first register backup, or to the end of
// the frame when nothing is backed up.
if (n_gpr_backups == 0 && n_fpr_backups == 0) {
prologue.n_stack_var_bytes = prologue.total_stack_usage - prologue.stack_var_offset;
} else if (n_gpr_backups == 0) {
// fprs only
prologue.n_stack_var_bytes = prologue.fpr_backup_offset - prologue.stack_var_offset;
} else if (n_fpr_backups == 0) {
// gprs only
prologue.n_stack_var_bytes = prologue.gpr_backup_offset - prologue.stack_var_offset;
} else {
// both, use gprs
assert(prologue.fpr_backup_offset > prologue.gpr_backup_offset);
prologue.n_stack_var_bytes = prologue.gpr_backup_offset - prologue.stack_var_offset;
}
assert(prologue.n_stack_var_bytes >= 0);
// check that the stack lines up by going in order
// RA backup
int total_stack = 0;
if (prologue.ra_backed_up) {
total_stack = align8(total_stack);
assert(prologue.ra_backup_offset == total_stack);
total_stack += 8;
}
if (!prologue.ra_backed_up && prologue.fp_backed_up) {
// GOAL does this for an unknown reason.
total_stack += 8;
}
// FP backup
if (prologue.fp_backed_up) {
total_stack = align8(total_stack);
assert(prologue.fp_backup_offset == total_stack);
total_stack += 8;
assert(prologue.fp_set);
}
// Stack Variables
if (prologue.n_stack_var_bytes) {
// no alignment because we don't know how the stack vars are aligned.
// stack var padding counts toward this section.
assert(prologue.stack_var_offset == total_stack);
total_stack += prologue.n_stack_var_bytes;
}
// GPRS
if (prologue.n_gpr_backup) {
total_stack = align16(total_stack);
assert(prologue.gpr_backup_offset == total_stack);
total_stack += 16 * prologue.n_gpr_backup;
}
// FPRS
if (prologue.n_fpr_backup) {
total_stack = align4(total_stack);
assert(prologue.fpr_backup_offset == total_stack);
total_stack += 4 * prologue.n_fpr_backup;
}
total_stack = align16(total_stack);
// End!
assert(prologue.total_stack_usage == total_stack);
}
// it's fine to have the entire first basic block be the prologue - you could loop back to the
// first instruction past the prologue.
assert(basic_blocks.at(0).end_word >= prologue_end);
basic_blocks.at(0).start_word = prologue_end;
prologue.decoded = true;
check_epilogue(file);
}
/*!
 * Print info about the prologue and stack.
 *
 * Every line of the result is prefixed with `indent` spaces. If the prologue was never decoded,
 * only a ";BAD PROLOGUE" marker is returned. Otherwise the summary line is followed by optional
 * lines for stack variables, GPR backups and FPR backups.
 */
std::string Function::Prologue::to_string(int indent) const {
  const std::string indent_str(indent, ' ');
  if (!decoded) {
    return indent_str + ";BAD PROLOGUE";
  }

  // Only the fixed-size numeric summary goes through a char buffer. Everything whose length is
  // not known in advance (the indent, register names) is appended to a std::string, so no input
  // can overrun a fixed buffer.
  char summary[96];
  snprintf(summary, sizeof(summary), ";stack: total 0x%02x, fp? %d ra? %d ep? %d",
           static_cast<unsigned>(total_stack_usage), int(fp_set), int(ra_backed_up),
           int(epilogue_ok));
  std::string result = indent_str + summary;

  if (n_stack_var_bytes) {
    result += "\n" + indent_str + ";stack_vars: " + std::to_string(n_stack_var_bytes) +
              " bytes at " + std::to_string(stack_var_offset);
  }

  if (n_gpr_backup) {
    result += "\n" + indent_str + ";gprs:";
    for (int i = 0; i < n_gpr_backup; i++) {
      result += " ";
      result += gpr_backups.at(i).to_string();
    }
  }

  if (n_fpr_backup) {
    result += "\n" + indent_str + ";fprs:";
    for (int i = 0; i < n_fpr_backup; i++) {
      result += " ";
      result += fpr_backups.at(i).to_string();
    }
  }

  return result;
}
/*!
 * Check that the epilogue matches the prologue.
 *
 * Walks backward from the last instruction and asserts, in this order: the stack restore in the
 * delay slot, the jr ra, the GPR restores, the FPR restores, the fp restore and the ra restore.
 * On success, trims the last basic block so it ends where the epilogue starts, sets
 * prologue.epilogue_ok and records epilogue_start/epilogue_end.
 *
 * Does nothing (other than printing a message) if the prologue was not decoded or the function is
 * suspected to be hand-written assembly.
 */
void Function::check_epilogue(const LinkedObjectFile& file) {
(void)file;
if (!prologue.decoded || suspected_asm) {
printf("not decoded, or suspected asm, skipping epilogue\n");
return;
}
// start at the end and move up.
int idx = int(instructions.size()) - 1;
// seek past alignment nops
while (is_nop(instructions.at(idx))) {
idx--;
}
epilogue_end = idx;
// stack restore
if (prologue.total_stack_usage) {
// hack - sometimes an asm function has a compiler inserted jr ra/daddu sp sp r0 that follows
// the "true" return. We really should have this function flagged as asm, but for now, we can
// simply skip over the compiler-generated jr ra/daddu sp sp r0.
if (is_gpr_3(instructions.at(idx), InstructionKind::DADDU, make_gpr(Reg::SP), make_gpr(Reg::SP),
make_gpr(Reg::R0))) {
idx--;
assert(is_jr_ra(instructions.at(idx)));
idx--;
printf(
"[Warning] %s Double Return Epilogue Hack! This is probably an ASM function in "
"disguise\n",
guessed_name.to_string().c_str());
warnings += "Double Return Epilogue - this is probably an ASM function\n";
}
// delay slot should be daddiu sp, sp, offset
assert(is_gpr_2_imm_int(instructions.at(idx), InstructionKind::DADDIU, make_gpr(Reg::SP),
make_gpr(Reg::SP), prologue.total_stack_usage));
idx--;
} else {
// delay slot is always daddu sp, sp, r0...
assert(is_gpr_3(instructions.at(idx), InstructionKind::DADDU, make_gpr(Reg::SP),
make_gpr(Reg::SP), make_gpr(Reg::R0)));
idx--;
}
// jr ra
assert(is_jr_ra(instructions.at(idx)));
idx--;
// restore gprs
// walking backward, the i-th load seen must reload table entry (n - 1 - i) from backup slot i.
for (int i = 0; i < prologue.n_gpr_backup; i++) {
int gpr_idx = prologue.n_gpr_backup - (1 + i);
const auto& expected_reg = gpr_backups.at(gpr_idx);
auto expected_offset = prologue.gpr_backup_offset + 16 * i;
assert(is_no_ll_gpr_load(instructions.at(idx), 16, true, expected_reg, expected_offset,
make_gpr(Reg::SP)));
idx--;
}
// restore fprs
// same indexing as the gprs, but with 4-byte slots.
for (int i = 0; i < prologue.n_fpr_backup; i++) {
int fpr_idx = prologue.n_fpr_backup - (1 + i);
const auto& expected_reg = fpr_backups.at(fpr_idx);
auto expected_offset = prologue.fpr_backup_offset + 4 * i;
assert(
is_no_ll_fpr_load(instructions.at(idx), expected_reg, expected_offset, make_gpr(Reg::SP)));
idx--;
}
// restore fp
if (prologue.fp_backed_up) {
assert(is_no_ll_gpr_load(instructions.at(idx), 8, true, make_gpr(Reg::FP),
prologue.fp_backup_offset, make_gpr(Reg::SP)));
idx--;
}
// restore ra
if (prologue.ra_backed_up) {
assert(is_no_ll_gpr_load(instructions.at(idx), 8, true, make_gpr(Reg::RA),
prologue.ra_backup_offset, make_gpr(Reg::SP)));
idx--;
}
// idx now points at the last instruction before the epilogue, so the epilogue starts at idx + 1.
assert(!basic_blocks.empty());
assert(idx + 1 >= basic_blocks.back().start_word);
basic_blocks.back().end_word = idx + 1;
prologue.epilogue_ok = true;
epilogue_start = idx + 1;
}
/*!
 * Scan this function for the sequence that stores a function's address into a symbol:
 *   lui  rX, <label>
 *   ori  rY, rX, <label>
 *   sw   rY, <symbol>(s7)
 * A match means the function at <label> is a global function named after <symbol>.
 *
 * Sets the guessed_name of the matched function and informs type_info about the symbol. Labels
 * which do not point to code are ignored.
 */
void Function::find_global_function_defs(LinkedObjectFile& file) {
  constexpr int kSeekLui = 0;
  constexpr int kSeekOri = 1;
  constexpr int kSeekSw = 2;

  int stage = kSeekLui;
  int label_id = -1;
  Register reg;

  for (const auto& instr : instructions) {
    // a lui of a label always (re)starts the match, no matter which stage we were in.
    const bool starts_sequence =
        instr.kind == InstructionKind::LUI && instr.get_src(0).kind == InstructionAtom::LABEL;
    if (starts_sequence) {
      stage = kSeekOri;
      reg = instr.get_dst(0).get_reg();
      label_id = instr.get_src(0).get_label();
      assert(label_id != -1);
      continue;
    }

    switch (stage) {
      case kSeekOri: {
        // the ori must consume the lui's register and refer to the same label
        const bool is_matching_ori = instr.kind == InstructionKind::ORI &&
                                     instr.get_src(0).get_reg() == reg &&
                                     instr.get_src(1).get_label() == label_id;
        if (is_matching_ori) {
          stage = kSeekSw;
          reg = instr.get_dst(0).get_reg();
        } else {
          stage = kSeekLui;
        }
      } break;

      case kSeekSw: {
        // the sw must store the ori's result relative to s7
        const bool is_matching_sw = instr.kind == InstructionKind::SW &&
                                    instr.get_src(0).get_reg() == reg &&
                                    instr.get_src(2).get_reg() == make_gpr(Reg::S7);
        if (!is_matching_sw) {
          stage = kSeekLui;
          break;
        }
        // found the full sequence
        std::string name = instr.get_src(1).get_sym();
        if (file.label_points_to_code(label_id)) {
          auto& func = file.get_function_at_label(label_id);
          assert(func.guessed_name.empty());
          func.guessed_name.set_as_global(name);
          get_type_info().inform_symbol(name, TypeSpec("function"));
          // todo - inform function.
        }
        // otherwise the label points to data, so it is discarded.
      } break;

      default:
        break;
    }
  }
}
/*!
 * Look through this function to find calls to method-set! which define methods.
 * The matched instruction sequence is:
 *   lw    t9, method-set!(s7)
 *   lw    a0, <type-name>(s7)
 *   addiu a1, r0, <method id>
 *   lui   rX, <label>
 *   ori   rY, rX, <label>
 *   jalr  ra, t9
 * On a match, sets the guessed_name of the function at <label> to a method of <type-name> with the
 * given method id.
 */
void Function::find_method_defs(LinkedObjectFile& file) {
// state is the number of sequence steps matched so far (0 = nothing matched).
int state = 0;
int label_id = -1;
int method_id = -1;
Register lui_reg;
std::string type_name;
for (const auto& instr : instructions) {
// look for lw t9, method-set!(s7)
// this check runs in every state, so a new method-set! load always restarts the match.
if (instr.kind == InstructionKind::LW && instr.get_dst(0).get_reg() == make_gpr(Reg::T9) &&
instr.get_src(0).kind == InstructionAtom::IMM_SYM &&
instr.get_src(0).get_sym() == "method-set!" &&
instr.get_src(1).get_reg() == make_gpr(Reg::S7)) {
state = 1;
continue;
}
if (state == 1) {
// look for lw a0, type-name(s7)
if (instr.kind == InstructionKind::LW && instr.get_dst(0).get_reg() == make_gpr(Reg::A0) &&
instr.get_src(0).kind == InstructionAtom::IMM_SYM &&
instr.get_src(1).get_reg() == make_gpr(Reg::S7)) {
type_name = instr.get_src(0).get_sym();
state = 2;
continue;
} else {
state = 0;
}
}
if (state == 2) {
// look for addiu a1, r0, x
if (instr.kind == InstructionKind::ADDIU && instr.get_dst(0).get_reg() == make_gpr(Reg::A1) &&
instr.get_src(0).get_reg() == make_gpr(Reg::R0)) {
method_id = instr.get_src(1).get_imm();
state = 3;
continue;
} else {
state = 0;
}
}
if (state == 3) {
// look for lui
if (instr.kind == InstructionKind::LUI && instr.get_src(0).kind == InstructionAtom::LABEL) {
state = 4;
lui_reg = instr.get_dst(0).get_reg();
label_id = instr.get_src(0).get_label();
assert(label_id != -1);
continue;
} else {
state = 0;
}
}
if (state == 4) {
// look for ori which uses the lui's register and the same label
if (instr.kind == InstructionKind::ORI && instr.get_src(0).get_reg() == lui_reg &&
instr.get_src(1).get_label() == label_id) {
state = 5;
lui_reg = instr.get_dst(0).get_reg();
continue;
} else {
state = 0;
}
}
if (state == 5) {
// look for jalr ra, t9. unlike the earlier states, a mismatch here does not reset the state,
// so other instructions may appear between the ori and the jalr.
if (instr.kind == InstructionKind::JALR && instr.get_dst(0).get_reg() == make_gpr(Reg::RA) &&
instr.get_src(0).get_reg() == make_gpr(Reg::T9)) {
auto& func = file.get_function_at_label(label_id);
assert(func.guessed_name.empty());
func.guessed_name.set_as_method(type_name, method_id);
state = 0;
continue;
}
}
}
}

View File

@ -0,0 +1,122 @@
#ifndef NEXT_FUNCTION_H
#define NEXT_FUNCTION_H
#include <string>
#include <vector>
#include "decompiler/Disasm/Instruction.h"
#include "BasicBlocks.h"
#include "CfgVtx.h"
/*!
 * The decompiler's guess at what a function is: unidentified, a named global function, a method
 * of a type, or top-level init code.
 */
struct FunctionName {
  enum class FunctionKind {
    UNIDENTIFIED,  // hasn't been identified yet.
    GLOBAL,        // global named function
    METHOD,
    TOP_LEVEL_INIT,
  } kind = FunctionKind::UNIDENTIFIED;

  std::string function_name;  // only applicable for GLOBAL
  std::string type_name;      // only applicable for METHOD
  int method_id = -1;         // only applicable for METHOD

  /*!
   * Printable name. Unidentified functions print as "(?)".
   */
  std::string to_string() const {
    switch (kind) {
      case FunctionKind::GLOBAL:
        return function_name;
      case FunctionKind::METHOD:
        return "(method " + std::to_string(method_id) + " " + type_name + ")";
      case FunctionKind::TOP_LEVEL_INIT:
        return "(top-level-login)";
      case FunctionKind::UNIDENTIFIED:
        return "(?)";
    }
    // Only reachable if kind holds a value outside the enum. Always return something so that
    // builds with asserts disabled do not run off the end of a non-void function.
    assert(false);
    return "(?)";
  }

  /*!
   * True if no name has been guessed yet.
   */
  bool empty() const { return kind == FunctionKind::UNIDENTIFIED; }

  void set_as_top_level() { kind = FunctionKind::TOP_LEVEL_INIT; }

  void set_as_global(std::string name) {
    kind = FunctionKind::GLOBAL;
    function_name = std::move(name);
  }

  void set_as_method(std::string tn, int id) {
    kind = FunctionKind::METHOD;
    type_name = std::move(tn);
    method_id = id;
  }

  /*!
   * True for the kinds (GLOBAL, METHOD) whose names are expected to be unique.
   */
  bool expected_unique() const {
    return kind == FunctionKind::GLOBAL || kind == FunctionKind::METHOD;
  }
};
/*!
 * A single function found in an object file: its word range, its decoded instructions and basic
 * blocks, and the results of the prologue/epilogue and naming analysis passes.
 */
class Function {
public:
Function(int _start_word, int _end_word);
// decode the prologue (and then check the epilogue); fills in `prologue`.
void analyze_prologue(const LinkedObjectFile& file);
// find functions which this function defines as globals, and set their guessed_name.
void find_global_function_defs(LinkedObjectFile& file);
// find functions which this function defines as methods, and set their guessed_name.
void find_method_defs(LinkedObjectFile& file);
int segment = -1;
int start_word = -1;
int end_word = -1; // not inclusive, but does include padding.
FunctionName guessed_name;
// set when the prologue contains a store pattern that analyze_prologue treats as a sign of asm.
bool suspected_asm = false;
std::vector<Instruction> instructions;
std::vector<BasicBlock> basic_blocks;
std::shared_ptr<ControlFlowGraph> cfg = nullptr;
// instruction indices bounding the prologue/epilogue, -1 until set by the analysis.
int prologue_start = -1;
int prologue_end = -1;
int epilogue_start = -1;
int epilogue_end = -1;
// human-readable warnings accumulated by the analysis passes, one per line.
std::string warnings;
struct Prologue {
bool decoded = false; // have we removed the prologue from basic blocks?
int total_stack_usage = -1;
// ra/fp are treated differently from other register backups
bool ra_backed_up = false;
int ra_backup_offset = -1;
bool fp_backed_up = false;
int fp_backup_offset = -1;
bool fp_set = false;
// number of GPR/FPR backups and the stack offset of the first one.
int n_gpr_backup = 0;
int gpr_backup_offset = -1;
int n_fpr_backup = 0;
int fpr_backup_offset = -1;
int n_stack_var_bytes = 0;
int stack_var_offset = -1;
// set by check_epilogue once the epilogue has been matched against this prologue.
bool epilogue_ok = false;
std::string to_string(int indent = 0) const;
} prologue;
bool uses_fp_register = false;
private:
void check_epilogue(const LinkedObjectFile& file);
};
#endif // NEXT_FUNCTION_H

View File

@ -0,0 +1,853 @@
/*!
* @file LinkedObjectFile.cpp
* An object file's data with linking information included.
*/
#include "LinkedObjectFile.h"
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstring>
#include <numeric>
#include <stdexcept>
#include "decompiler/Disasm/InstructionDecode.h"
#include "decompiler/config.h"
/*!
 * Set how many segments this object file has and size every per-segment table to match.
 * May only be called once (the count must still be zero), and before any words are added.
 */
void LinkedObjectFile::set_segment_count(int n_segs) {
  // the count may only be assigned while it still has its initial value
  assert(segments == 0);
  segments = n_segs;

  // one entry per segment in each per-segment table
  functions_by_seg.resize(n_segs);
  offset_of_data_zone_by_seg.resize(n_segs);
  label_per_seg_by_offset.resize(n_segs);
  words_by_seg.resize(n_segs);
}
/*!
 * Append one raw word to the end of the given segment.
 */
void LinkedObjectFile::push_back_word_to_segment(uint32_t word, int segment) {
  auto& segment_words = words_by_seg.at(segment);
  segment_words.emplace_back(word);
}
/*!
* Get a label ID for a label which points to the given offset in the given segment.
* Will return an existing label if one exists.
*/
int LinkedObjectFile::get_label_id_for(int seg, int offset) {
auto kv = label_per_seg_by_offset.at(seg).find(offset);
if (kv == label_per_seg_by_offset.at(seg).end()) {
// create a new label
int id = labels.size();
Label label;
label.target_segment = seg;
label.offset = offset;
label.name = "L" + std::to_string(id);
label_per_seg_by_offset.at(seg)[offset] = id;
labels.push_back(label);
return id;
} else {
// return an existing label
auto& label = labels.at(kv->second);
assert(label.offset == offset);
assert(label.target_segment == seg);
return kv->second;
}
}
/*!
 * Look up the label pointing to the given byte offset in the given segment.
 * Returns the label ID, or -1 if no label points there.
 */
int LinkedObjectFile::get_label_at(int seg, int offset) const {
  const auto& labels_in_seg = label_per_seg_by_offset.at(seg);
  const auto hit = labels_in_seg.find(offset);
  return hit == labels_in_seg.end() ? -1 : hit->second;
}
/*!
* Does this label point to code? Can point to the middle of a function, or the start of a function.
*/
bool LinkedObjectFile::label_points_to_code(int label_id) const {
auto& label = labels.at(label_id);
auto data_start = int(offset_of_data_zone_by_seg.at(label.target_segment)) * 4;
return label.offset < data_start;
}
/*!
 * Get the function starting at this label, or error if there is none.
 *
 * With asserts enabled a missing function aborts. With asserts disabled it throws
 * std::runtime_error rather than handing back an unrelated function.
 */
Function& LinkedObjectFile::get_function_at_label(int label_id) {
  const auto& label = labels.at(label_id);
  for (auto& func : functions_by_seg.at(label.target_segment)) {
    // + 4 to skip past the type tag to the first word, which is where the label points.
    if (func.start_word * 4 + 4 == label.offset) {
      return func;
    }
  }
  assert(false);
  throw std::runtime_error("get_function_at_label: no function starts at label " + label.name);
}
/*!
* Get the name of the label.
*/
std::string LinkedObjectFile::get_label_name(int label_id) const {
return labels.at(label_id).name;
}
/*!
 * Record that the word at the source location is a pointer to the destination location.
 * Returns false (and changes nothing) if the destination lies past the end of its segment,
 * otherwise marks the word as a PTR to a label at the destination and returns true.
 */
bool LinkedObjectFile::pointer_link_word(int source_segment,
                                         int source_offset,
                                         int dest_segment,
                                         int dest_offset) {
  assert((source_offset % 4) == 0);
  auto& source_word = words_by_seg.at(source_segment).at(source_offset / 4);
  assert(source_word.kind == LinkedWord::PLAIN_DATA);

  const int dest_word_idx = dest_offset / 4;
  const int dest_word_count = (int)words_by_seg.at(dest_segment).size();
  if (dest_word_idx > dest_word_count) {
    // bad link: silently ignored.
    return false;
  }
  assert(dest_word_idx <= dest_word_count);

  source_word.kind = LinkedWord::PTR;
  source_word.label_id = get_label_id_for(dest_segment, dest_offset);
  return true;
}
/*!
 * Record that the word at the source location is linked to a symbol, type or the empty list.
 * `kind` says which. A word that was already linked is overwritten after printing a message.
 */
void LinkedObjectFile::symbol_link_word(int source_segment,
                                        int source_offset,
                                        const char* name,
                                        LinkedWord::Kind kind) {
  assert((source_offset % 4) == 0);
  const int word_idx = source_offset / 4;
  auto& target = words_by_seg.at(source_segment).at(word_idx);

  // relinking is reported but tolerated
  const bool already_linked = target.kind != LinkedWord::PLAIN_DATA;
  if (already_linked) {
    printf("bad symbol link word\n");
  }

  target.kind = kind;
  target.symbol_name = name;
}
/*!
 * Record that the lower 16 bits of the word at the source location hold the offset of the named
 * symbol relative to the symbol table register.
 */
void LinkedObjectFile::symbol_link_offset(int source_segment, int source_offset, const char* name) {
  assert((source_offset % 4) == 0);
  const int word_idx = source_offset / 4;
  auto& target = words_by_seg.at(source_segment).at(word_idx);
  assert(target.kind == LinkedWord::PLAIN_DATA);

  target.kind = LinkedWord::SYM_OFFSET;
  target.symbol_name = name;
}
/*!
 * Record that a lui/ori pair loads a pointer to the destination location. The word at the hi
 * offset becomes a HI_PTR and the word at the lo offset a LO_PTR, both referring to one label.
 */
void LinkedObjectFile::pointer_link_split_word(int source_segment,
                                               int source_hi_offset,
                                               int source_lo_offset,
                                               int dest_segment,
                                               int dest_offset) {
  assert((source_hi_offset % 4) == 0);
  assert((source_lo_offset % 4) == 0);

  auto& upper = words_by_seg.at(source_segment).at(source_hi_offset / 4);
  auto& lower = words_by_seg.at(source_segment).at(source_lo_offset / 4);
  // note: the destination offset is not range checked here.
  assert(upper.kind == LinkedWord::PLAIN_DATA);
  assert(lower.kind == LinkedWord::PLAIN_DATA);

  const int shared_label = get_label_id_for(dest_segment, dest_offset);
  upper.kind = LinkedWord::HI_PTR;
  upper.label_id = shared_label;
  lower.kind = LinkedWord::LO_PTR;
  lower.label_id = shared_label;
}
/*!
 * Rename every label to L1, L2, ... following the order of the addresses they point to (by
 * segment first, then by offset). Any custom label names are overwritten.
 * Returns the number of labels.
 */
uint32_t LinkedObjectFile::set_ordered_label_names() {
  // sort label IDs rather than the labels themselves, so that IDs stay valid
  std::vector<int> order(labels.size());
  std::iota(order.begin(), order.end(), 0);

  const auto by_address = [&](int lhs_id, int rhs_id) {
    const auto& lhs = labels.at(lhs_id);
    const auto& rhs = labels.at(rhs_id);
    if (lhs.target_segment != rhs.target_segment) {
      return lhs.target_segment < rhs.target_segment;
    }
    return lhs.offset < rhs.offset;
  };
  std::sort(order.begin(), order.end(), by_address);

  size_t next_number = 1;
  for (int label_id : order) {
    labels.at(label_id).name = "L" + std::to_string(next_number);
    next_number++;
  }
  return labels.size();
}
// Display names of the segments, indexed by segment id. Used for the section headers written by
// print_words and print_disassembly, which both assert that there are at most 3 segments.
static const char* segment_names[] = {"main segment", "debug segment", "top-level segment"};
/*!
 * Dump every word of every segment as text, including link information and labels.
 * Segments are printed from the highest segment id down to 0.
 */
std::string LinkedObjectFile::print_words() {
  assert(segments <= 3);
  std::string result;

  for (int seg = segments - 1; seg >= 0; seg--) {
    // segment header
    result += ";------------------------------------------\n; ";
    result += segment_names[seg];
    result += "\n;------------------------------------------\n";

    auto& seg_words = words_by_seg.at(seg);
    for (size_t i = 0; i < seg_words.size(); i++) {
      // a label may point at any of the four bytes of this word
      for (int j = 0; j < 4; j++) {
        const auto label_id = get_label_at(seg, i * 4 + j);
        if (label_id == -1) {
          continue;
        }
        result += labels.at(label_id).name + ":";
        if (j != 0) {
          result += " (offset " + std::to_string(j) + ")";
        }
        result += "\n";
      }

      append_word_to_string(result, seg_words[i]);
    }
  }

  return result;
}
/*!
 * Add a word's printed representation to the end of a string. Internal helper for print_words.
 * Throws std::runtime_error for a word kind which has no printed form.
 */
void LinkedObjectFile::append_word_to_string(std::string& dest, const LinkedWord& word) const {
  // Only the hex value is formatted through a (small, bounded) char buffer. Label and symbol
  // names are appended as std::string, so a long name cannot overrun a fixed-size buffer.
  const auto hex = [](unsigned value) {
    char digits[16];
    snprintf(digits, sizeof(digits), "0x%x", value);
    return std::string(digits);
  };

  switch (word.kind) {
    case LinkedWord::PLAIN_DATA:
      dest += " .word " + hex(word.data) + "\n";
      break;
    case LinkedWord::PTR:
      dest += " .word " + labels.at(word.label_id).name + "\n";
      break;
    case LinkedWord::SYM_PTR:
      dest += " .symbol " + word.symbol_name + "\n";
      break;
    case LinkedWord::TYPE_PTR:
      dest += " .type " + word.symbol_name + "\n";
      break;
    case LinkedWord::EMPTY_PTR:
      dest += " .empty-list\n";  // ?
      break;
    case LinkedWord::HI_PTR:
      dest += " .ptr-hi " + hex(word.data >> 16) + " " + labels.at(word.label_id).name + "\n";
      break;
    case LinkedWord::LO_PTR:
      dest += " .ptr-lo " + hex(word.data >> 16) + " " + labels.at(word.label_id).name + "\n";
      break;
    case LinkedWord::SYM_OFFSET:
      dest += " .sym-off " + hex(word.data >> 16) + " " + word.symbol_name + "\n";
      break;
    default:
      throw std::runtime_error("nyi");
  }
}
/*!
 * For each segment, determine where the data area starts. Before the data area is the code area.
 * Stores the result (in words) in offset_of_data_zone_by_seg and updates the code/data byte
 * counts in stats.
 */
void LinkedObjectFile::find_code() {
  if (segments == 1) {
    // single segment object files should never have any code.
    auto& seg = words_by_seg.front();
    for (auto& word : seg) {
      if (!word.symbol_name.empty()) {
        assert(word.symbol_name != "function");
      }
    }
    offset_of_data_zone_by_seg.at(0) = 0;
    stats.data_bytes = words_by_seg.front().size() * 4;
    stats.code_bytes = 0;
  } else if (segments == 3) {
    // V3 object files will have all the functions, then all the static data. So to find the
    // divider, we look for the last "function" tag, then find the last jr $ra instruction after
    // that (plus one for delay slot) and assume that after that is data. Additionally, we check to
    // make sure that there are no "function" type tags in the data section, although this is
    // redundant.
    for (int i = 0; i < segments; i++) {
      // try to find the last reference to "function":
      bool found_function = false;
      size_t function_loc = -1;
      for (size_t j = words_by_seg.at(i).size(); j-- > 0;) {
        auto& word = words_by_seg.at(i).at(j);
        if (word.kind == LinkedWord::TYPE_PTR && word.symbol_name == "function") {
          function_loc = j;
          found_function = true;
          break;
        }
      }

      if (found_function) {
        // look forward until we find "jr ra". There is deliberately no break: we want the last
        // one in the segment.
        const uint32_t jr_ra = 0x3e00008;
        bool found_jr_ra = false;
        size_t jr_ra_loc = -1;
        for (size_t j = function_loc; j < words_by_seg.at(i).size(); j++) {
          auto& word = words_by_seg.at(i).at(j);
          if (word.kind == LinkedWord::PLAIN_DATA && word.data == jr_ra) {
            found_jr_ra = true;
            jr_ra_loc = j;
          }
        }
        assert(found_jr_ra);
        assert(jr_ra_loc + 1 < words_by_seg.at(i).size());
        // data starts after the jr ra and its delay slot
        offset_of_data_zone_by_seg.at(i) = jr_ra_loc + 2;
      } else {
        // no functions
        offset_of_data_zone_by_seg.at(i) = 0;
      }

      // add label for debug purposes
      if (offset_of_data_zone_by_seg.at(i) < words_by_seg.at(i).size()) {
        auto data_label_id = get_label_id_for(i, 4 * (offset_of_data_zone_by_seg.at(i)));
        labels.at(data_label_id).name = "L-data-start";
      }

      // verify there are no functions after the data section starts
      for (size_t j = offset_of_data_zone_by_seg.at(i); j < words_by_seg.at(i).size(); j++) {
        auto& word = words_by_seg.at(i).at(j);
        if (word.kind == LinkedWord::TYPE_PTR && word.symbol_name == "function") {
          assert(false);
        }
      }

      // sizes: each word is 4 bytes, on both sides of the code/data divider.
      stats.data_bytes += 4 * (words_by_seg.at(i).size() - offset_of_data_zone_by_seg.at(i));
      stats.code_bytes += 4 * offset_of_data_zone_by_seg.at(i);
    }
  } else {
    // files for which we couldn't extract link data yet will have 0 segments, and that's ok.
    assert(segments == 0);
  }
}
/*!
 * Find all the functions in each segment and record them, in address order, in functions_by_seg.
 */
void LinkedObjectFile::find_functions() {
  if (segments == 1) {
    // it's a v2 file, shouldn't have any functions
    assert(offset_of_data_zone_by_seg.at(0) == 0);
    return;
  }

  // we assume functions don't have any data in between them, so the "function" type tag marks
  // both the end of the previous function and the start of the next. This means that some
  // functions will have a few 0x0 words after them for padding (GOAL functions are aligned), but
  // this is something that the disassembler should handle.
  for (int seg = 0; seg < segments; seg++) {
    // the code area ends where the data zone begins; carve functions off its end one at a time.
    int function_end = offset_of_data_zone_by_seg.at(seg);
    while (function_end > 0) {
      // scan backward from the word before function_end for a function type tag
      bool found_function_tag_loc = false;
      int function_tag_loc = function_end - 1;
      while (function_tag_loc >= 0) {
        auto& word = words_by_seg.at(seg).at(function_tag_loc);
        if (word.kind == LinkedWord::TYPE_PTR && word.symbol_name == "function") {
          found_function_tag_loc = true;
          break;
        }
        function_tag_loc--;
      }

      // everything from the tag up to function_end is one function. continue from its start.
      assert(found_function_tag_loc);
      stats.function_count++;
      functions_by_seg.at(seg).emplace_back(function_tag_loc, function_end);
      function_end = function_tag_loc;
    }

    // functions were found last-to-first, so flip them into address order.
    auto& found = functions_by_seg.at(seg);
    std::reverse(found.begin(), found.end());
  }
}
/*!
 * Decode every word of every function into an instruction, appending to the function's
 * instruction list and counting the valid ones in stats.decoded_ops.
 */
void LinkedObjectFile::disassemble_functions() {
  for (int seg = 0; seg < segments; seg++) {
    for (auto& function : functions_by_seg.at(seg)) {
      auto& decoded = function.instructions;
      for (auto word_idx = function.start_word; word_idx < function.end_word; word_idx++) {
        auto& raw_word = words_by_seg.at(seg).at(word_idx);
        decoded.push_back(decode_instruction(raw_word, *this, seg, word_idx));
        if (decoded.back().is_valid()) {
          stats.decoded_ops++;
        }
      }
    }
  }
}
/*!
 * Analyze disassembly for use of the FP register, and add labels for fp-relative data access
 *
 * For every instruction which reads fp as a source register, the immediate offset (possibly built
 * by a preceding ori or lui + ori) is replaced with a label at the fp-relative address. Also sets
 * uses_fp_register on the function and updates the fp usage counters in stats. Asserts on any
 * fp-using instruction kind which is not handled.
 */
void LinkedObjectFile::process_fp_relative_links() {
for (int seg = 0; seg < segments; seg++) {
for (auto& function : functions_by_seg.at(seg)) {
for (size_t instr_idx = 0; instr_idx < function.instructions.size(); instr_idx++) {
// we possibly need to look at three instructions
auto& instr = function.instructions[instr_idx];
auto* prev_instr = (instr_idx > 0) ? &function.instructions[instr_idx - 1] : nullptr;
auto* pprev_instr = (instr_idx > 1) ? &function.instructions[instr_idx - 2] : nullptr;
// ignore storing FP onto the stack
if ((instr.kind == InstructionKind::SD || instr.kind == InstructionKind::SQ) &&
instr.get_src(0).get_reg() == Register(Reg::GPR, Reg::FP)) {
continue;
}
// HACKs
if (instr.kind == InstructionKind::PEXTLW) {
continue;
}
// search over instruction sources
for (int i = 0; i < instr.n_src; i++) {
auto& src = instr.src[i];
if (src.kind == InstructionAtom::REGISTER // must be reg
&& src.get_reg().get_kind() == Reg::GPR // gpr
&& src.get_reg().get_gpr() == Reg::FP) { // fp reg.
stats.n_fp_reg_use++;
// offset of fp at this instruction.
// this is the byte offset of the word after the function's first word (the type tag).
int current_fp = 4 * (function.start_word + 1);
function.uses_fp_register = true;
switch (instr.kind) {
// fp-relative load
case InstructionKind::LW:
case InstructionKind::LWC1:
case InstructionKind::LD:
// generate pointer to fp-relative data
case InstructionKind::DADDIU: {
// all four kinds share this body: the instruction's own immediate is the fp offset.
auto& atom = instr.get_imm_src();
atom.set_label(get_label_id_for(seg, current_fp + atom.get_imm()));
stats.n_fp_reg_use_resolved++;
} break;
// in the case that addiu doesn't have enough range (+/- 2^15), GOAL has two
// strategies: 1). use ori + daddu (ori doesn't sign extend, so this lets us go +2^16,
// -0) 2). use lui + ori + daddu (can reach anywhere in the address space) It seems
// that addu is used to get pointers to floating point values and daddu is used in
// other cases. Also, the position of the fp register is swapped between the two.
case InstructionKind::DADDU:
case InstructionKind::ADDU: {
assert(prev_instr);
assert(prev_instr->kind == InstructionKind::ORI);
// the source which is not fp holds the offset: src 0 for daddu, src 1 for addu.
int offset_reg_src_id = instr.kind == InstructionKind::DADDU ? 0 : 1;
auto offset_reg = instr.get_src(offset_reg_src_id).get_reg();
assert(offset_reg == prev_instr->get_dst(0).get_reg());
assert(offset_reg == prev_instr->get_src(0).get_reg());
// the label replaces the immediate of the ori, not of the add itself.
auto& atom = prev_instr->get_imm_src();
int additional_offset = 0;
if (pprev_instr && pprev_instr->kind == InstructionKind::LUI) {
assert(pprev_instr->get_dst(0).get_reg() == offset_reg);
// the lui supplies the upper 16 bits of the offset.
additional_offset = (1 << 16) * pprev_instr->get_imm_src().get_imm();
}
atom.set_label(
get_label_id_for(seg, current_fp + atom.get_imm() + additional_offset));
stats.n_fp_reg_use_resolved++;
} break;
default:
printf("unknown fp using op: %s\n", instr.to_string(*this).c_str());
assert(false);
}
}
}
}
}
}
}
/*!
* Print disassembled functions and data segments.
* Segments are visited from the highest index down to 0. For each segment, every function is
* printed (header, prologue, optional warnings, one line per instruction, then CFG output if the
* function has a CFG), followed by a word-by-word dump of the segment's data zone.
* @return the whole listing as one string.
*/
std::string LinkedObjectFile::print_disassembly() {
// when set, instruction lines are padded to 60 columns and followed by the raw word.
bool write_hex = get_config().write_hex_near_instructions;
std::string result;
// segment_names is indexed by seg below; presumably it has 3 entries - TODO confirm.
assert(segments <= 3);
for (int seg = segments; seg-- > 0;) {
// segment header
result += ";------------------------------------------\n; ";
result += segment_names[seg];
result += "\n;------------------------------------------\n\n";
// functions
for (auto& func : functions_by_seg.at(seg)) {
result += ";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;\n";
result += "; .function " + func.guessed_name.to_string() + "\n";
result += ";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;\n";
result += func.prologue.to_string(2) + "\n";
if(!func.warnings.empty()) {
result += "Warnings: " + func.warnings + "\n";
}
// print each instruction in the function.
// the loop starts at 1, so word 0 of the function is never printed as an instruction
// (presumably it is the function's type tag - TODO confirm).
bool in_delay_slot = false;
for (int i = 1; i < func.end_word - func.start_word; i++) {
// a label exactly at this instruction's byte offset gets its own line.
auto label_id = get_label_at(seg, (func.start_word + i) * 4);
if (label_id != -1) {
result += labels.at(label_id).name + ":\n";
}
// a label pointing into the middle of an instruction word is treated as an error.
for (int j = 1; j < 4; j++) {
// assert(get_label_at(seg, (func.start_word + i)*4 + j) == -1);
if (get_label_at(seg, (func.start_word + i) * 4 + j) != -1) {
result += "BAD OFFSET LABEL: ";
result += labels.at(get_label_at(seg, (func.start_word + i) * 4 + j)).name + "\n";
assert(false);
}
}
auto& instr = func.instructions.at(i);
std::string line = " " + instr.to_string(*this);
if (write_hex) {
if (line.length() < 60) {
line.append(60 - line.length(), ' ');
}
result += line;
result += " ;;";
auto& word = words_by_seg[seg].at(func.start_word + i);
// no "\n" is added in this branch; append_word_to_string presumably ends the line - confirm.
append_word_to_string(result, word);
} else {
result += line + "\n";
}
// emit a blank line after the instruction that follows an op with a delay slot.
if (in_delay_slot) {
result += "\n";
in_delay_slot = false;
}
if (gOpcodeInfo[(int)instr.kind].has_delay_slot) {
in_delay_slot = true;
}
}
result += "\n";
//
// int bid = 0;
// for(auto& bblock : func.basic_blocks) {
// result += "BLOCK " + std::to_string(bid++)+ "\n";
// for(int i = bblock.start_word; i < bblock.end_word; i++) {
// if(i >= 0 && i < func.instructions.size()) {
// result += func.instructions.at(i).to_string(*this) + "\n";
// } else {
// result += "BAD BBLOCK INSTR ID " + std::to_string(i);
// }
// }
// }
// hack
// the dot output is only appended for CFGs that are not fully resolved.
if(func.cfg && !func.cfg->is_fully_resolved()) {
result += func.cfg->to_dot();
result += "\n";
}
if(func.cfg) {
result += func.cfg->to_form_string() + "\n";
// To debug block stuff.
/*
int bid = 0;
for(auto& block : func.basic_blocks) {
in_delay_slot = false;
result += "B" + std::to_string(bid++) + "\n";
for(auto i = block.start_word; i < block.end_word; i++) {
auto label_id = get_label_at(seg, (func.start_word + i) * 4);
if (label_id != -1) {
result += labels.at(label_id).name + ":\n";
}
auto& instr = func.instructions.at(i);
result += " " + instr.to_string(*this) + "\n";
if (in_delay_slot) {
result += "\n";
in_delay_slot = false;
}
if (gOpcodeInfo[(int)instr.kind].has_delay_slot) {
in_delay_slot = true;
}
}
}
*/
}
result += "\n\n\n";
}
// print data
// the data zone runs from offset_of_data_zone_by_seg[seg] (a word index) to the end of the segment.
for (size_t i = offset_of_data_zone_by_seg.at(seg); i < words_by_seg.at(seg).size(); i++) {
// unlike code, data labels may point at any of the 4 bytes of a word.
for (int j = 0; j < 4; j++) {
auto label_id = get_label_at(seg, i * 4 + j);
if (label_id != -1) {
result += labels.at(label_id).name + ":";
if (j != 0) {
result += " (offset " + std::to_string(j) + ")";
}
result += "\n";
}
}
auto& word = words_by_seg[seg][i];
append_word_to_string(result, word);
// a type pointer to "string" is taken as the type tag of a string object; print its text too.
if (word.kind == LinkedWord::TYPE_PTR && word.symbol_name == "string") {
result += "; " + get_goal_string(seg, i) + "\n";
}
}
}
return result;
}
/*!
* Hacky way to get a GOAL string object
*/
std::string LinkedObjectFile::get_goal_string(int seg, int word_idx) {
std::string result = "\"";
// next should be the size
if (word_idx + 1 >= int(words_by_seg[seg].size())) {
return "invalid string!\n";
}
LinkedWord& size_word = words_by_seg[seg].at(word_idx + 1);
if (size_word.kind != LinkedWord::PLAIN_DATA) {
// sometimes an array of string pointer triggers this!
return "invalid string!\n";
}
// result += "(size " + std::to_string(size_word.data) + "): ";
// now characters...
for (size_t i = 0; i < size_word.data; i++) {
int word_offset = word_idx + 2 + (i / 4);
int byte_offset = i % 4;
auto& word = words_by_seg[seg].at(word_offset);
if (word.kind != LinkedWord::PLAIN_DATA) {
return "invalid string! (check me!)\n";
}
char cword[4];
memcpy(cword, &word.data, 4);
result += cword[byte_offset];
}
return result + "\"";
}
/*!
 * Check whether at least one segment of this object file holds a function.
 * @return true as soon as a non-empty function list is found, false if every list is empty.
 */
bool LinkedObjectFile::has_any_functions() {
  bool found = false;
  for (size_t seg = 0; !found && seg < functions_by_seg.size(); seg++) {
    found = !functions_by_seg[seg].empty();
  }
  return found;
}
/*!
* Print all scripts in this file.
*/
std::string LinkedObjectFile::print_scripts() {
std::string result;
for (int seg = 0; seg < segments; seg++) {
std::vector<bool> already_printed(words_by_seg[seg].size(), false);
// the linked list layout algorithm of GOAL puts the first pair first.
// so we want to go in forward order to catch the beginning correctly
for (size_t word_idx = 0; word_idx < words_by_seg[seg].size(); word_idx++) {
// don't print parts of scripts we've already seen
// (note that scripts could share contents, which is supported, this is just for starting
// off a script print)
if (already_printed[word_idx])
continue;
// check for linked list by looking for anything that accesses this as a pair (offset of 2)
auto label_id = get_label_at(seg, 4 * word_idx + 2);
if (label_id != -1) {
auto& label = labels.at(label_id);
if ((label.offset & 7) == 2) {
result += to_form_script(seg, word_idx, already_printed)->toStringPretty(0, 100) + "\n";
}
}
}
}
return result;
}
/*!
 * Does the word at the given byte offset hold a pointer to the empty list?
 * @param seg       segment to look in.
 * @param byte_idx  byte offset of the word, must be a multiple of 4.
 */
bool LinkedObjectFile::is_empty_list(int seg, int byte_idx) {
  assert((byte_idx % 4) == 0);
  const int word_idx = byte_idx / 4;
  return words_by_seg.at(seg).at(word_idx).kind == LinkedWord::EMPTY_PTR;
}
/*!
 * Build a Form from a linked list stored in the object file, for easy printing.
 * Note: word_idx is the word holding the car of the first pair (in GOAL, (&-> obj car)), not the
 * pair pointer itself.
 * @param seg       segment containing the list.
 * @param word_idx  word index of the car of the first pair.
 * @param seen      per-word flags; the car word of every visited pair is set to true.
 * @return the head pair of the resulting Form list.
 */
std::shared_ptr<Form> LinkedObjectFile::to_form_script(int seg,
                                                       int word_idx,
                                                       std::vector<bool>& seen) {
  // A totally empty list looks like a symbol and would never be flagged as a list, so the result
  // always starts with a pair.
  auto head = std::make_shared<Form>();
  head->kind = FormKind::PAIR;

  // pair currently being filled in.
  auto tail = head;
  // pair pointer (address of car + 2) of the pair currently being converted.
  int pair_addr = word_idx * 4 + 2;

  while (true) {
    if ((pair_addr & 7) != 2) {
      // not a pair: callers only hand us pair pointers, so this should be impossible.
      assert(false);
    } else {
      // car first (again addressed as (&-> obj car))
      tail->pair[0] = to_form_script_object(seg, pair_addr - 2, seen);
      seen.at(pair_addr / 4) = true;

      const int cdr_addr = pair_addr + 2;
      if (is_empty_list(seg, cdr_addr)) {
        // proper end of list
        tail->pair[1] = gSymbolTable.getEmptyPair();
        return head;
      }

      // cdr object should be aligned.
      assert((cdr_addr % 4) == 0);
      auto& cdr_word = words_by_seg.at(seg).at(cdr_addr / 4);
      const bool cdr_is_pair =
          cdr_word.kind == LinkedWord::PTR && (labels.at(cdr_word.label_id).offset & 7) == 2;

      if (!cdr_is_pair) {
        // improper list: the cdr is the final object.
        tail->pair[1] = to_form_script_object(seg, cdr_addr, seen);
        return head;
      }

      // proper list continues: append a fresh pair and move on to it.
      pair_addr = labels.at(cdr_word.label_id).offset;
      tail->pair[1] = std::make_shared<Form>();
      tail->pair[1]->kind = FormKind::PAIR;
      tail = tail->pair[1];
    }
  }

  return head;
}
/*!
 * Is the thing pointed to a string?
 * True when byte_idx is word aligned and the word just before it is a type pointer to "string".
 * @param seg       segment the pointer points into.
 * @param byte_idx  byte offset the pointer points to.
 */
bool LinkedObjectFile::is_string(int seg, int byte_idx) {
  const bool aligned = (byte_idx % 4) == 0;
  if (!aligned) {
    return false;  // must be aligned pointer.
  }

  // the type tag sits in the word before the object.
  const int tag_byte = byte_idx - 4;
  if (tag_byte < 0) {
    return false;
  }

  const auto& seg_words = words_by_seg.at(seg);
  if (size_t(tag_byte) >= seg_words.size() * 4) {
    return false;  // must fit in segment
  }

  const auto& tag_word = seg_words.at(tag_byte / 4);
  if (tag_word.kind != LinkedWord::TYPE_PTR) {
    return false;
  }
  return tag_word.symbol_name == "string";
}
/*!
* Convert a (pointer object) to some nice representation.
*/
std::shared_ptr<Form> LinkedObjectFile::to_form_script_object(int seg,
int byte_idx,
std::vector<bool>& seen) {
std::shared_ptr<Form> result;
switch (byte_idx & 7) {
case 0:
case 4: {
auto& word = words_by_seg.at(seg).at(byte_idx / 4);
if (word.kind == LinkedWord::SYM_PTR) {
// .symbol xxxx
result = toForm(word.symbol_name);
} else if (word.kind == LinkedWord::PLAIN_DATA) {
// .word xxxxx
result = toForm(std::to_string(word.data));
} else if (word.kind == LinkedWord::PTR) {
// might be a sub-list, or some other random pointer
auto offset = labels.at(word.label_id).offset;
if ((offset & 7) == 2) {
// list!
result = to_form_script(seg, offset / 4, seen);
} else {
if (is_string(seg, offset)) {
result = toForm(get_goal_string(seg, offset / 4 - 1));
} else {
// some random pointer, just print the label.
result = toForm(labels.at(word.label_id).name);
}
}
} else if (word.kind == LinkedWord::EMPTY_PTR) {
result = gSymbolTable.getEmptyPair();
} else {
std::string debug;
append_word_to_string(debug, word);
printf("don't know how to print %s\n", debug.c_str());
assert(false);
}
} break;
case 2: // bad, a pair snuck through.
default:
// pointers should be aligned!
printf("align %d\n", byte_idx & 7);
assert(false);
}
return result;
}

View File

@ -0,0 +1,131 @@
/*!
* @file LinkedObjectFile.h
* An object file's data with linking information included.
*/
#ifndef NEXT_LINKEDOBJECTFILE_H
#define NEXT_LINKEDOBJECTFILE_H
#include <cstdint>
#include <vector>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include "LinkedWord.h"
#include "decompiler/Function/Function.h"
#include "decompiler/util/LispPrint.h"
/*!
* A label to a location in this object file.
* Doesn't have to be word aligned.
*/
struct Label {
// name printed in front of the labeled location in the disassembly.
std::string name;
// NOTE(review): presumably the segment that offset is relative to - confirm against the code
// that creates labels.
int target_segment;
int offset; // in bytes. (offset & 7) == 2 is used to recognize a pointer to a pair.
};
/*!
* An object file's data with linking information included.
* Words, functions and the start of the data zone are stored per segment; labels are shared by
* all segments and referred to by their index in labels.
*/
class LinkedObjectFile {
public:
LinkedObjectFile() = default;
// --- building the file (called by the linking code) ---
void set_segment_count(int n_segs);
void push_back_word_to_segment(uint32_t word, int segment);
int get_label_id_for(int seg, int offset);
// callers treat a return value of -1 as "no label at this offset".
int get_label_at(int seg, int offset) const;
bool label_points_to_code(int label_id) const;
// callers print a "bad link" warning when this returns false.
bool pointer_link_word(int source_segment, int source_offset, int dest_segment, int dest_offset);
void pointer_link_split_word(int source_segment, int source_hi_offset, int source_lo_offset, int dest_segment, int dest_offset);
void symbol_link_word(int source_segment, int source_offset, const char* name, LinkedWord::Kind kind);
void symbol_link_offset(int source_segment, int source_offset, const char* name);
Function& get_function_at_label(int label_id);
std::string get_label_name(int label_id) const;
uint32_t set_ordered_label_names();
// --- analysis and printing ---
void find_code();
std::string print_words();
void find_functions();
void disassemble_functions();
void process_fp_relative_links();
// print every script (linked list) found in the file.
std::string print_scripts();
// print disassembled functions and data segments.
std::string print_disassembly();
// true if any segment contains at least one function.
bool has_any_functions();
void append_word_to_string(std::string& dest, const LinkedWord& word) const;
// Counters collected while linking and decompiling this file.
struct Stats {
uint32_t total_code_bytes = 0;
uint32_t total_v2_code_bytes = 0;
uint32_t total_v2_pointers = 0;
uint32_t total_v2_pointer_seeks = 0;
uint32_t total_v2_link_bytes = 0;
uint32_t total_v2_symbol_links = 0;
uint32_t total_v2_symbol_count = 0;
uint32_t v3_code_bytes = 0;
uint32_t v3_pointers = 0;
uint32_t v3_split_pointers = 0;
uint32_t v3_word_pointers = 0;
uint32_t v3_pointer_seeks = 0;
uint32_t v3_link_bytes = 0;
uint32_t v3_symbol_count = 0;
uint32_t v3_symbol_link_offset = 0;
uint32_t v3_symbol_link_word = 0;
uint32_t data_bytes = 0;
uint32_t code_bytes = 0;
uint32_t function_count = 0;
uint32_t decoded_ops = 0;
uint32_t n_fp_reg_use = 0;
uint32_t n_fp_reg_use_resolved = 0;
// Accumulate another file's counters into this one, field by field.
// When adding a counter above, remember to add it here too.
void add(const Stats& other) {
total_code_bytes += other.total_code_bytes;
total_v2_code_bytes += other.total_v2_code_bytes;
total_v2_pointers += other.total_v2_pointers;
total_v2_pointer_seeks += other.total_v2_pointer_seeks;
total_v2_link_bytes += other.total_v2_link_bytes;
total_v2_symbol_links += other.total_v2_symbol_links;
total_v2_symbol_count += other.total_v2_symbol_count;
v3_code_bytes += other.v3_code_bytes;
v3_pointers += other.v3_pointers;
v3_pointer_seeks += other.v3_pointer_seeks;
v3_link_bytes += other.v3_link_bytes;
v3_word_pointers += other.v3_word_pointers;
v3_split_pointers += other.v3_split_pointers;
v3_symbol_count += other.v3_symbol_count;
v3_symbol_link_offset += other.v3_symbol_link_offset;
v3_symbol_link_word += other.v3_symbol_link_word;
data_bytes += other.data_bytes;
code_bytes += other.code_bytes;
function_count += other.function_count;
decoded_ops += other.decoded_ops;
n_fp_reg_use += other.n_fp_reg_use;
n_fp_reg_use_resolved += other.n_fp_reg_use_resolved;
}
} stats;
// number of segments in this file.
int segments = 0;
// the words of each segment, indexed by [segment][word index].
std::vector<std::vector<LinkedWord>> words_by_seg;
// per segment, the word index at which print_disassembly starts dumping data.
std::vector<uint32_t> offset_of_data_zone_by_seg;
// the functions of each segment.
std::vector<std::vector<Function>> functions_by_seg;
// all labels, indexed by label id.
std::vector<Label> labels;
private:
// convert a linked list to a Form. word_idx is the word holding the car of the first pair.
std::shared_ptr<Form> to_form_script(int seg, int word_idx, std::vector<bool>& seen);
// convert a single object (the word at byte_idx) to a Form.
std::shared_ptr<Form> to_form_script_object(int seg, int byte_idx, std::vector<bool> &seen);
bool is_empty_list(int seg, int byte_idx);
bool is_string(int seg, int byte_idx);
// word_idx is the word holding the string's type tag.
std::string get_goal_string(int seg, int word_idx);
// NOTE(review): presumably maps byte offset -> label id for each segment - confirm in the .cpp.
std::vector<std::unordered_map<int, int>> label_per_seg_by_offset;
};
#endif //NEXT_LINKEDOBJECTFILE_H

View File

@ -0,0 +1,797 @@
/*!
* @file LinkedObjectFileCreation.cpp
* Create a LinkedObjectFile from raw object file data.
* This implements a decoder for the GOAL linking format.
*/
#include <cassert>
#include <cstring>
#include "LinkedObjectFileCreation.h"
#include "decompiler/config.h"
#include "decompiler/TypeSystem/TypeInfo.h"
// There are three link versions:
// V2 - not really in use anymore, but V4 will reuse logic from it (and the game didn't rename the
//      functions).
// V3 - optimized for code and small stuff. Supports segments (main, debug, top-level).
// V4 - optimized for data (never code) and big stuff, special optimization possible for large V4
//      objects at the end of DGO.
// Internally V4 is really just a V2, but with the link data coming after the object data.
// There's a V4 header at the beginning, the object data, and then a V2 header and V2 link data.
// Header for link data used for V2, V3, V4 objects. For V3/V4, this is found at the beginning of
// the object data.
// Only the first 16 bits of the version are declared here; the version-specific headers below
// declare either a 32-bit version (V2/V3/V4) or a 16-bit version followed by another 16-bit
// field (V5).
struct LinkHeaderCommon {
uint32_t type_tag; // for the basic offset, is 0 or -1 depending on version
uint32_t length; // different exact meanings, but length of the link data.
uint16_t version; // what version (2, 3, 4)
};
// Header for link data used for V2 linking data.
// In a V4 object this header is found after the object data; link_v4 asserts the values noted
// below.
struct LinkHeaderV2 {
uint32_t type_tag; // always -1 (0xffffffff)
uint32_t length; // length of link data
uint32_t version; // always 2
};
// Header for link data used for V4. Found at the very beginning of the file; the object data
// starts immediately after it.
struct LinkHeaderV4 {
uint32_t type_tag; // always -1
uint32_t length; // length of V2 link data found after object. (matches the V2 header's length)
uint32_t version; // always 4
uint32_t code_size; // length of object data before link data starts
};
// Per-segment info for V3 and V5 link data
struct SegmentInfo {
uint32_t relocs; // offset of relocation table (link_v5 adds 0x50 to this value)
uint32_t data; // offset of segment data, relative to the start of the data region
uint32_t size; // segment data size (0 if segment doesn't exist)
uint32_t magic; // asserted to be 0 by link_v3 and 1 by link_v5
};
// Header for V3 objects, found at the beginning of the file.
struct LinkHeaderV3 {
uint32_t type_tag; // always 0
uint32_t length; // length of link data. The segment data offsets are relative to this.
uint32_t version; // always 3
uint32_t segments; // always 3
char name[64]; // name of object file, zero padded after the terminator
SegmentInfo segment_info[3];
};
// Header for V5 objects, found at the beginning of the file.
// The number at the start of each field comment is the field's byte offset in the header.
struct LinkHeaderV5 {
uint32_t type_tag; // 0 always 0?
uint32_t length_to_get_to_code; // 4 length.. of link data? (asserted: this - link_length == 0x50)
uint16_t version; // 8
uint16_t unknown; // 10
uint32_t pad; // 12 (asserted to be 0x50 by link_v5)
uint32_t link_length; // 16
uint8_t n_segments; // 20 (3 expected; link_v5 gives up on files where this is 1)
char name[59]; // 21 (really??)
SegmentInfo segment_info[3];
};
// The types of symbol links.
// The symlink functions turn these into LinkedWord::EMPTY_PTR, TYPE_PTR and SYM_PTR words.
enum class SymbolLinkKind {
EMPTY_LIST, // link to the empty list
TYPE, // link to a type
SYMBOL // link to a symbol
};
/*!
 * Handle symbol links for a single symbol in a V2/V4 object file (link_v5 uses it as well).
 * Walks the symbol's seek table and records a link in f for every reference to the symbol.
 * Every read from data is bounds checked: a truncated or corrupt file throws std::out_of_range
 * instead of reading outside of the buffer.
 * @param f                object file that receives the links.
 * @param data             the entire raw object file.
 * @param code_ptr_offset  offset in data of the start of the segment being linked.
 * @param link_ptr_offset  offset in data of the first entry of this symbol's seek table.
 * @param kind             what the name refers to (symbol, type, or the empty list).
 * @param name             name of the symbol or type.
 * @param seg_id           segment the links are added to.
 * @return offset in data just past the zero byte that terminates the seek table.
 */
static uint32_t c_symlink2(LinkedObjectFile& f,
                           const std::vector<uint8_t>& data,
                           uint32_t code_ptr_offset,
                           uint32_t link_ptr_offset,
                           SymbolLinkKind kind,
                           const char* name,
                           int seg_id) {
  get_type_info().inform_symbol_with_no_type_info(name);
  const auto initial_offset = code_ptr_offset;

  do {
    // The link table has a series of variable-length-encoded integers indicating the seek amount
    // to hit each reference to the symbol. It ends when the seek is 0, and all references to this
    // symbol have been patched.
    // The low two bits of the value decoded so far tell if another byte follows; they are masked
    // off before the seek is applied.
    uint32_t seek = data.at(link_ptr_offset);
    uint32_t encoded_size = 1;
    if (seek & 3) {
      seek |= uint32_t(data.at(link_ptr_offset + 1)) << 8;
      encoded_size = 2;
      if (seek & 2) {
        seek |= uint32_t(data.at(link_ptr_offset + 2)) << 16;
        encoded_size = 3;
        if (seek & 1) {
          seek |= uint32_t(data.at(link_ptr_offset + 3)) << 24;
          encoded_size = 4;
        }
      }
    }

    f.stats.total_v2_symbol_links++;
    link_ptr_offset += encoded_size;
    code_ptr_offset += (seek & 0xfffffffc);

    // the value of the code gives us more information.
    // check the last byte of the word too, and copy it out instead of casting the byte pointer.
    (void)data.at(code_ptr_offset + sizeof(uint32_t) - 1);
    uint32_t code_value = 0;
    std::memcpy(&code_value, &data.at(code_ptr_offset), sizeof(code_value));

    if (code_value == 0xffffffff) {
      // absolute link - replace entire word with a pointer.
      LinkedWord::Kind word_kind;
      switch (kind) {
        case SymbolLinkKind::SYMBOL:
          word_kind = LinkedWord::SYM_PTR;
          break;
        case SymbolLinkKind::EMPTY_LIST:
          word_kind = LinkedWord::EMPTY_PTR;
          break;
        case SymbolLinkKind::TYPE:
          get_type_info().inform_type(name);
          word_kind = LinkedWord::TYPE_PTR;
          break;
        default:
          throw std::runtime_error("unhandled SymbolLinkKind");
      }
      f.symbol_link_word(seg_id, code_ptr_offset - initial_offset, name, word_kind);
    } else {
      // offset link - replace lower 16 bits with symbol table offset.
      assert((code_value & 0xffff) == 0 || (code_value & 0xffff) == 0xffff);
      assert(kind == SymbolLinkKind::SYMBOL);
      // assert(false); // this case does not occur in V2/V4. It does in V3.
      f.symbol_link_offset(seg_id, code_ptr_offset - initial_offset, name);
    }
  } while (data.at(link_ptr_offset));

  // seek past terminating 0.
  return link_ptr_offset + 1;
}
/*!
 * Handle symbol links for a single symbol in a V3 object file.
 * Walks the symbol's seek table and records a link in f for every reference to the symbol.
 * Every read from data is bounds checked: a truncated or corrupt file throws std::out_of_range
 * instead of reading outside of the buffer.
 * @param f         object file that receives the links.
 * @param data      the entire raw object file.
 * @param code_ptr  offset in data that the seeks start from.
 * @param link_ptr  offset in data of the first entry of this symbol's seek table.
 * @param kind      what the name refers to (symbol, type, or the empty list).
 * @param name      name of the symbol or type.
 * @param seg       segment the links are added to.
 * @return offset in data just past the zero byte that terminates the seek table.
 */
static uint32_t c_symlink3(LinkedObjectFile& f,
                           const std::vector<uint8_t>& data,
                           uint32_t code_ptr,
                           uint32_t link_ptr,
                           SymbolLinkKind kind,
                           const char* name,
                           int seg) {
  get_type_info().inform_symbol_with_no_type_info(name);
  const auto initial_offset = code_ptr;

  do {
    // seek, with a variable length encoding that sucks:
    // each byte is a number of words to skip, and a byte of 0xff means another byte follows.
    uint8_t c;
    do {
      c = data.at(link_ptr);
      link_ptr++;
      code_ptr += c * 4;
    } while (c == 0xff);

    // identical logic to symlink 2.
    // check the last byte of the word too, and copy it out instead of casting the byte pointer.
    (void)data.at(code_ptr + sizeof(uint32_t) - 1);
    uint32_t code_value = 0;
    std::memcpy(&code_value, &data.at(code_ptr), sizeof(code_value));

    if (code_value == 0xffffffff) {
      // absolute link - the entire word becomes a pointer.
      f.stats.v3_symbol_link_word++;
      LinkedWord::Kind word_kind;
      switch (kind) {
        case SymbolLinkKind::SYMBOL:
          word_kind = LinkedWord::SYM_PTR;
          break;
        case SymbolLinkKind::EMPTY_LIST:
          word_kind = LinkedWord::EMPTY_PTR;
          break;
        case SymbolLinkKind::TYPE:
          get_type_info().inform_type(name);
          word_kind = LinkedWord::TYPE_PTR;
          break;
        default:
          throw std::runtime_error("unhandled SymbolLinkKind");
      }
      f.symbol_link_word(seg, code_ptr - initial_offset, name, word_kind);
    } else {
      // offset link, only valid for symbols.
      f.stats.v3_symbol_link_offset++;
      assert(kind == SymbolLinkKind::SYMBOL);
      f.symbol_link_offset(seg, code_ptr - initial_offset, name);
    }
  } while (data.at(link_ptr));

  // seek past terminating 0.
  return link_ptr + 1;
}
// Round in up to the next multiple of 64 (values that are already a multiple are unchanged).
static uint32_t align64(uint32_t in) {
  const uint32_t low_bits = 64u - 1u;
  const uint32_t bumped = in + low_bits;
  return bumped - (bumped & low_bits);
}
// Round in up to the next multiple of 16 (values that are already a multiple are unchanged).
static uint32_t align16(uint32_t in) {
  const uint32_t low_bits = 16u - 1u;
  const uint32_t bumped = in + low_bits;
  return bumped - (bumped & low_bits);
}
/*!
* Process link data for a "V4" object file.
* In reality a V4 seems to be just a V2 object, but with the link data after the real data.
* There's a V4 header at the very beginning, but another V2 header/link data at the end
* -----------------------------------------------
* | V4 header | data | V2 header | V2 link data |
* -----------------------------------------------
* All of the object data is added to f as a single segment (segment 0), then the pointer table
* and the symbol table of the V2 link data are applied to it.
* @param f     object file to fill in.
* @param data  the entire raw object file.
* @param name  name of the object file, only used for warning messages.
*/
static void link_v4(LinkedObjectFile& f,
const std::vector<uint8_t>& data,
const std::string& name) {
// read the V4 header to find where the link data really is
const auto* header = (const LinkHeaderV4*)&data.at(0);
uint32_t link_data_offset = header->code_size + sizeof(LinkHeaderV4); // no basic offset
// code starts immediately after the header
uint32_t code_offset = sizeof(LinkHeaderV4);
uint32_t code_size = header->code_size;
f.stats.total_code_bytes += code_size;
f.stats.total_v2_code_bytes += code_size;
// add all code
const uint8_t* code_start = &data.at(code_offset);
const uint8_t* code_end =
&data.at(code_offset + code_size); // safe because link data is after code.
assert(((code_end - code_start) % 4) == 0);
f.set_segment_count(1);
for (auto x = code_start; x < code_end; x += 4) {
f.push_back_word_to_segment(*((const uint32_t*)x), 0);
}
// read v2 header after the code
const uint8_t* link_data = &data.at(link_data_offset);
const auto* link_header_v2 = (const LinkHeaderV2*)(link_data); // subtract off type tag
assert(link_header_v2->type_tag == 0xffffffff);
assert(link_header_v2->version == 2);
assert(link_header_v2->length == header->length);
f.stats.total_v2_link_bytes += link_header_v2->length;
uint32_t link_ptr_offset = link_data_offset + sizeof(LinkHeaderV2);
// first "section" of link data is a list of where all the pointer are.
if (data.at(link_ptr_offset) == 0) {
// there are no pointers.
link_ptr_offset++;
} else {
// there are pointers.
// there are a series of variable-length coded integers, indicating where the pointers are, in
// the form: seek_amount, number_of_consecutive_pointers, seek_amount,
// number_of_consecutive_pointers, ... , 0
uint32_t code_ptr_offset = code_offset;
bool fixing = false; // either seeking or fixing
while (true) { // loop over entire table
while (true) { // loop over current mode (fixing/seeking)
// get count from table
auto count = data.at(link_ptr_offset);
link_ptr_offset++;
if (!fixing) {
// then we are seeking
code_ptr_offset += 4 * count;
f.stats.total_v2_pointer_seeks++;
} else {
// then we are fixing consecutive pointers
// the word currently stored at the location is passed on as the destination offset.
for (uint8_t i = 0; i < count; i++) {
if (!f.pointer_link_word(0, code_ptr_offset - code_offset, 0,
*((const uint32_t*)(&data.at(code_ptr_offset))))) {
printf("WARNING bad link in %s\n", name.c_str());
}
f.stats.total_v2_pointers++;
code_ptr_offset += 4;
}
}
// check if we are done with the current integer
// (a byte of 0xff means the integer continues in the next byte)
if (count != 0xff)
break;
// when we "end" an encoded integer on an 0xff, we need an explicit zero byte to change
// modes. this handles this special case.
if (data.at(link_ptr_offset) == 0) {
link_ptr_offset++;
fixing = !fixing;
}
}
// mode ended, switch
fixing = !fixing;
// we got a zero, that means we're done with pointer fixing.
if (data.at(link_ptr_offset) == 0)
break;
}
link_ptr_offset++;
}
// second "section" of link data is a list of symbols to fix up.
if (data.at(link_ptr_offset) == 0) {
// no symbols
} else {
while (true) {
uint32_t reloc = data.at(link_ptr_offset);
link_ptr_offset++;
const char* s_name;
SymbolLinkKind kind;
if ((reloc & 0x80) == 0) {
// it's a symbol
// the byte we just read is the first character of the name, so step back onto it.
if (reloc > 9) {
// always happens.
link_ptr_offset--;
} else {
assert(false);
}
s_name = (const char*)(&data.at(link_ptr_offset));
kind = SymbolLinkKind::SYMBOL;
} else {
// it's a type
// the low 7 bits are a method count, and the name starts at the next byte.
kind = SymbolLinkKind::TYPE;
uint8_t method_count = reloc & 0x7f;
s_name = (const char*)(&data.at(link_ptr_offset));
if (method_count == 0) {
// NOTE(review): method_count is assigned here but never read afterwards.
method_count = 1;
// hack which will add 44 methods to _newly created_ types
// I assume the thing generating V2 objects didn't know about method counts.
// so this was a "safe" backup - if linking a V2 object requires allocating a type.
// just be on the safe side.
// (see the !symbolValue case in intern_type_from_c)
} else {
assert(false);
}
}
// the empty list is linked by name like a symbol, but gets its own kind.
if (std::string("_empty_") == s_name) {
assert(kind == SymbolLinkKind::SYMBOL);
kind = SymbolLinkKind::EMPTY_LIST;
}
// skip the name and its terminator, the symbol's seek table follows.
link_ptr_offset += strlen(s_name) + 1;
f.stats.total_v2_symbol_count++;
link_ptr_offset = c_symlink2(f, data, code_offset, link_ptr_offset, kind, s_name, 0);
// a zero where the next entry would start ends the symbol list.
if (data.at(link_ptr_offset) == 0)
break;
}
}
// check length
assert(link_header_v2->length == align64(link_ptr_offset - link_data_offset + 1));
// everything left in the file should be zero padding.
while (link_ptr_offset < data.size()) {
assert(data.at(link_ptr_offset) == 0);
link_ptr_offset++;
}
}
/*!
 * Assert that every byte after the string's null terminator, up to size bytes from str, is zero.
 * The search for the terminator is limited to size bytes, so a buffer that is completely filled
 * (no terminator) is never read past its end; in that case there is nothing left to check.
 * @param str   start of the fixed-size character buffer.
 * @param size  total size of the buffer in bytes.
 */
static void assert_string_empty_after(const char* str, int size) {
  int idx = 0;

  // skip the string itself
  while (idx < size && str[idx]) {
    idx++;
  }

  // the terminator and all of the padding after it must be zero
  for (; idx < size; idx++) {
    assert(!str[idx]);
  }
}
/*!
* Process link data for a "V5" object file.
* The layout handled here is a header, then the link data of each segment, then the data of each
* segment. Each segment's words are added to f, then its pointer table and symbol table are
* applied. Files with a single segment are not supported and are skipped with a message.
* @param f     object file to fill in.
* @param data  the entire raw object file.
* @param name  name of the object file, checked against the name stored in the header.
*/
static void link_v5(LinkedObjectFile& f,
const std::vector<uint8_t>& data,
const std::string& name) {
auto header = (const LinkHeaderV5*)(&data.at(0));
if (header->n_segments == 1) {
// f is left untouched in this case.
printf("abandon %s!\n", name.c_str());
return;
}
assert(header->type_tag == 0);
assert(name == header->name);
assert(header->n_segments == 3);
assert(header->pad == 0x50);
assert(header->length_to_get_to_code - header->link_length == 0x50);
f.set_segment_count(3);
// link v3's data size is data.size() - link_length
// link v5's data size is data.size() - new_link_length - 0x50.
// lbp + 4 points to version?
// lbp points to 4 past start of header.
// lbp[1] = version + unknown 16 bit thing.
// lbp[3] = link block length (minus 0x50)
// todo - check this against the code size we actually got.
// size_t expected_code_size = data.size() - (header->link_length + 0x50);
uint32_t data_ptr_offset = header->length_to_get_to_code;
uint32_t segment_data_offsets[3];
uint32_t segment_link_offsets[3];
// only written for segments with a non-zero size, see the NOTE at the end of the function.
uint32_t segment_link_ends[3];
for (int i = 0; i < 3; i++) {
segment_data_offsets[i] = data_ptr_offset + header->segment_info[i].data;
segment_link_offsets[i] = header->segment_info[i].relocs + 0x50;
assert(header->segment_info[i].magic == 1);
}
// check that the data region is filled
// (each segment, rounded up to 16 bytes, must end where the next one starts)
for (int i = 0; i < 2; i++) {
assert(align16(segment_data_offsets[i] + header->segment_info[i].size) ==
segment_data_offsets[i + 1]);
}
assert(align16(segment_data_offsets[2] + header->segment_info[2].size) == data.size());
// loop over segments (reverse order for now)
for (int seg_id = 3; seg_id-- > 0;) {
// ?? is this right?
if (header->segment_info[seg_id].size == 0)
continue;
auto segment_size = header->segment_info[seg_id].size;
f.stats.v3_code_bytes += segment_size;
// if(gGameVersion == JAK2) {
// round the segment size up to a whole number of words.
bool adjusted = false;
while (segment_size % 4) {
segment_size++;
adjusted = true;
}
if (adjusted) {
printf(
"Adjusted the size of segment %d in %s, this is fine, but rare (and may indicate a "
"bigger problem if it happens often)\n",
seg_id, name.c_str());
}
// }
auto base_ptr = segment_data_offsets[seg_id];
// data_ptr starts one word before the segment; the first seek of the pointer table moves it in.
auto data_ptr = base_ptr - 4;
auto link_ptr = segment_link_offsets[seg_id];
assert((data_ptr % 4) == 0);
assert((segment_size % 4) == 0);
// add all words of the segment: [base_ptr, base_ptr + segment_size)
auto code_start = (const uint32_t*)(&data.at(data_ptr + 4));
auto code_end = ((const uint32_t*)(&data.at(data_ptr + segment_size))) + 1;
for (auto x = code_start; x < code_end; x++) {
f.push_back_word_to_segment(*((const uint32_t*)x), seg_id);
}
// pointer table: alternating "seek" and "fix" counts, where a count byte of 0xff means the
// count continues in the next byte.
bool fixing = false;
if (data.at(link_ptr)) {
// we have pointers
while (true) {
while (true) {
if (!fixing) {
// seeking
data_ptr += 4 * data.at(link_ptr);
f.stats.v3_pointer_seeks++;
} else {
// fixing.
for (uint32_t i = 0; i < data.at(link_ptr); i++) {
f.stats.v3_pointers++;
uint32_t old_code = *(const uint32_t*)(&data.at(data_ptr));
if ((old_code >> 24) == 0) {
// the upper byte is clear: the word itself is the destination offset, in this segment.
f.stats.v3_word_pointers++;
if (!f.pointer_link_word(seg_id, data_ptr - base_ptr, seg_id, old_code)) {
printf("WARNING bad pointer_link_word (2) in %s\n", name.c_str());
}
} else {
// split pointer: this word and another word lo_hi_offset words later make up the
// pointer. bits 8-11 are the destination segment, bits 12-15 the distance to the low
// word, and the low byte the upper bits of the offset.
f.stats.v3_split_pointers++;
auto dest_seg = (old_code >> 8) & 0xf;
auto lo_hi_offset = (old_code >> 12) & 0xf;
assert(lo_hi_offset);
assert(dest_seg < 3);
auto offset_upper = old_code & 0xff;
// assert(offset_upper == 0);
uint32_t low_code = *(const uint32_t*)(&data.at(data_ptr + 4 * lo_hi_offset));
uint32_t offset = low_code & 0xffff;
if (offset_upper) {
// seems to work fine, no need to warn.
// printf("WARNING - offset upper is set in %s\n", name.c_str());
offset += (offset_upper << 16);
}
f.pointer_link_split_word(seg_id, data_ptr - base_ptr,
data_ptr + 4 * lo_hi_offset - base_ptr, dest_seg, offset);
}
data_ptr += 4;
}
}
if (data.at(link_ptr) != 0xff)
break;
link_ptr++;
// a count ending on 0xff needs an explicit zero byte before the mode changes.
if (data.at(link_ptr) == 0) {
link_ptr++;
fixing = !fixing;
}
}
link_ptr++;
fixing = !fixing;
// a zero where the next count would be ends the pointer table.
if (data.at(link_ptr) == 0)
break;
}
}
link_ptr++;
// symbol table: one entry per symbol/type, each followed by its seek table.
if (data.at(link_ptr)) {
auto sub_link_ptr = link_ptr;
while (true) {
auto reloc = data.at(sub_link_ptr);
auto next_link_ptr = sub_link_ptr + 1;
link_ptr = next_link_ptr;
if ((reloc & 0x80) == 0) {
// symbol (or the empty list): the name starts 3 bytes into the entry.
link_ptr = sub_link_ptr + 3; //
const char* sname = (const char*)(&data.at(link_ptr));
link_ptr += strlen(sname) + 1;
// todo segment data offsets...
if (std::string("_empty_") == sname) {
link_ptr = c_symlink2(f, data, segment_data_offsets[seg_id], link_ptr,
SymbolLinkKind::EMPTY_LIST, sname, seg_id);
} else {
link_ptr = c_symlink2(f, data, segment_data_offsets[seg_id], link_ptr,
SymbolLinkKind::SYMBOL, sname, seg_id);
}
} else if ((reloc & 0x3f) == 0x3f) {
assert(false); // todo, does this ever get hit?
} else {
// type: the low 6 bits encode the method count.
// NOTE(review): n_methods is computed but never used afterwards.
int n_methods_base = reloc & 0x3f;
int n_methods = n_methods_base * 4;
if (n_methods_base) {
n_methods += 3;
}
link_ptr += 2; // ghidra misses some aliasing here and would have you think this is +1!
const char* sname = (const char*)(&data.at(link_ptr));
link_ptr += strlen(sname) + 1;
link_ptr = c_symlink2(f, data, segment_data_offsets[seg_id], link_ptr,
SymbolLinkKind::TYPE, sname, seg_id);
}
sub_link_ptr = link_ptr;
// a zero where the next entry would start ends the symbol table.
if (!data.at(sub_link_ptr))
break;
}
}
segment_link_ends[seg_id] = link_ptr;
}
// check that the link data of the segments is packed back to back.
assert(segment_link_offsets[0] == 128);
if (header->segment_info[0].size) {
assert(segment_link_ends[0] + 1 == segment_link_offsets[1]);
} else {
assert(segment_link_offsets[0] + 2 == segment_link_offsets[1]);
}
if (header->segment_info[1].size) {
assert(segment_link_ends[1] + 1 == segment_link_offsets[2]);
} else {
assert(segment_link_offsets[1] + 2 == segment_link_offsets[2]);
}
// NOTE(review): unlike segments 0 and 1, segment 2 is not checked for a size of 0 here, so
// segment_link_ends[2] is read uninitialized when segment 2 was skipped by the loop above.
assert(align16(segment_link_ends[2] + 2) == segment_data_offsets[0]);
}
/*!
 * Link a version 3 object file into f.
 * A v3 file has a LinkHeaderV3 at offset 0, followed by the link tables of the three segments,
 * followed (header->length bytes into the file) by the data of the three segments.
 * For each non-empty segment this copies all of its words into f, then walks that segment's
 * pointer link table and symbol link table and records the links on f.
 * The layout assumptions made along the way are checked with asserts.
 */
static void link_v3(LinkedObjectFile& f,
const std::vector<uint8_t>& data,
const std::string& name) {
// the header sits at the very start of the file; sanity check it against what we expect
auto header = (const LinkHeaderV3*)(&data.at(0));
assert(name == header->name);
assert(header->segments == 3);
f.set_segment_count(3);
assert_string_empty_after(header->name, 64);
for (int i = 0; i < 3; i++) {
assert(header->segment_info[i].magic == 0);
// printf(" [%d] %d %d %d %d\n", i, header->segment_info[i].size,
// header->segment_info[i].data, header->segment_info[i].magic,
// header->segment_info[i].relocs);
}
f.stats.v3_link_bytes += header->length;
// segment data offsets in the header are relative to the end of the link data (header->length)
uint32_t data_ptr_offset = header->length;
uint32_t segment_data_offsets[3];
uint32_t segment_link_offsets[3];
// NOTE(review): only written for non-empty segments (see the `continue` below), but
// segment_link_ends[2] is read unconditionally by the final assert - confirm segment 2 is
// never empty.
uint32_t segment_link_ends[3];
for (int i = 0; i < 3; i++) {
segment_data_offsets[i] = data_ptr_offset + header->segment_info[i].data;
segment_link_offsets[i] = header->segment_info[i].relocs;
}
// check that the data region is filled
for (int i = 0; i < 2; i++) {
assert(align16(segment_data_offsets[i] + header->segment_info[i].size) ==
segment_data_offsets[i + 1]);
}
assert(align16(segment_data_offsets[2] + header->segment_info[2].size) == data.size());
// todo - check link region is filled.
// loop over segments (reverse order for now)
for (int seg_id = 3; seg_id-- > 0;) {
// ?? is this right?
if (header->segment_info[seg_id].size == 0)
continue;
auto segment_size = header->segment_info[seg_id].size;
f.stats.v3_code_bytes += segment_size;
// HACK!
// why is this a thing?
// HACK!
if (get_config().game_version == 1 && name == "level-h" && seg_id == 0) {
segment_size++;
}
// for game version 2, round the size up to a whole number of words
if (get_config().game_version == 2) {
bool adjusted = false;
while (segment_size % 4) {
segment_size++;
adjusted = true;
}
if (adjusted) {
printf(
"Adjusted the size of segment %d in %s, this is fine, but rare (and may indicate a "
"bigger problem if it happens often)\n",
seg_id, name.c_str());
}
}
// base_ptr: file offset of the first word of this segment
// data_ptr: cursor used by the pointer link table, starts one word before the segment
// link_ptr: cursor into this segment's link table
auto base_ptr = segment_data_offsets[seg_id];
auto data_ptr = base_ptr - 4;
auto link_ptr = segment_link_offsets[seg_id];
assert((data_ptr % 4) == 0);
assert((segment_size % 4) == 0);
// copy every word of the segment (from base_ptr up to base_ptr + segment_size) into f
auto code_start = (const uint32_t*)(&data.at(data_ptr + 4));
auto code_end = ((const uint32_t*)(&data.at(data_ptr + segment_size))) + 1;
for (auto x = code_start; x < code_end; x++) {
f.push_back_word_to_segment(*((const uint32_t*)x), seg_id);
}
// Pointer link table: a sequence of count bytes that alternates between two modes.
// seeking: advance data_ptr by <count> words without touching them
// fixing: the next <count> words at data_ptr are pointers that need to be linked
// The table starts in seeking mode. A first byte of 0 means there is no pointer table.
bool fixing = false;
if (data.at(link_ptr)) {
// we have pointers
while (true) {
while (true) {
if (!fixing) {
// seeking
data_ptr += 4 * data.at(link_ptr);
f.stats.v3_pointer_seeks++;
} else {
// fixing.
for (uint32_t i = 0; i < data.at(link_ptr); i++) {
f.stats.v3_pointers++;
uint32_t old_code = *(const uint32_t*)(&data.at(data_ptr));
if ((old_code >> 24) == 0) {
// upper byte is zero: the whole word is linked as a pointer, with the stored value
// as the offset into this same segment
f.stats.v3_word_pointers++;
if (!f.pointer_link_word(seg_id, data_ptr - base_ptr, seg_id, old_code)) {
printf("WARNING bad pointer_link_word (2) in %s\n", name.c_str());
}
} else {
// otherwise the pointer is split over two words: this word and the one
// lo_hi_offset words after it. Fields packed in this word:
// bits 8-11: destination segment
// bits 12-15: distance (in words) to the word holding the low 16 bits
// bits 0-7: bits 16-23 of the offset
f.stats.v3_split_pointers++;
auto dest_seg = (old_code >> 8) & 0xf;
auto lo_hi_offset = (old_code >> 12) & 0xf;
assert(lo_hi_offset);
assert(dest_seg < 3);
auto offset_upper = old_code & 0xff;
// assert(offset_upper == 0);
uint32_t low_code = *(const uint32_t*)(&data.at(data_ptr + 4 * lo_hi_offset));
uint32_t offset = low_code & 0xffff;
if (offset_upper) {
// seems to work fine, no need to warn.
// printf("WARNING - offset upper is set in %s\n", name.c_str());
offset += (offset_upper << 16);
}
f.pointer_link_split_word(seg_id, data_ptr - base_ptr,
data_ptr + 4 * lo_hi_offset - base_ptr, dest_seg, offset);
}
data_ptr += 4;
}
}
// a count of 0xff means the current run continues with the next count byte, in the same
// mode. (a 0 directly after the 0xff is skipped and flips the mode instead)
if (data.at(link_ptr) != 0xff)
break;
link_ptr++;
if (data.at(link_ptr) == 0) {
link_ptr++;
fixing = !fixing;
}
}
// run finished: move to the next count byte and switch mode. A 0 here ends the table.
link_ptr++;
fixing = !fixing;
if (data.at(link_ptr) == 0)
break;
}
}
link_ptr++;
// Symbol link table: one entry per symbol/type, terminated by a 0 byte.
while (data.at(link_ptr)) {
auto reloc = data.at(link_ptr);
SymbolLinkKind kind;
link_ptr++;
const char* s_name = nullptr;
if ((reloc & 0x80) == 0) {
// it's a symbol
// (high bit clear: the byte we just read is the first character of the name, so back up)
kind = SymbolLinkKind::SYMBOL;
link_ptr--;
s_name = (const char*)(&data.at(link_ptr));
} else {
// methods todo
// high bit set: this is a type. The low 7 bits are passed on as the type's method count
// and the name starts at the following byte.
s_name = (const char*)(&data.at(link_ptr));
get_type_info().inform_type_method_count(s_name, reloc & 0x7f);
kind = SymbolLinkKind::TYPE;
}
// the symbol named "_empty_" is linked as the empty list instead of a normal symbol
if (std::string("_empty_") == s_name) {
assert(kind == SymbolLinkKind::SYMBOL);
kind = SymbolLinkKind::EMPTY_LIST;
}
// skip the null terminated name, then let c_symlink3 consume this symbol's links.
// it returns the position in the link table after them.
link_ptr += strlen(s_name) + 1;
f.stats.v3_symbol_count++;
link_ptr = c_symlink3(f, data, base_ptr, link_ptr, kind, s_name, seg_id);
}
segment_link_ends[seg_id] = link_ptr;
}
// check that the link tables sit where we expect: the first starts at offset 128, and each
// one is followed directly by the next (an empty segment's table takes up 2 bytes)
assert(segment_link_offsets[0] == 128);
if (header->segment_info[0].size) {
assert(segment_link_ends[0] + 1 == segment_link_offsets[1]);
} else {
assert(segment_link_offsets[0] + 2 == segment_link_offsets[1]);
}
if (header->segment_info[1].size) {
assert(segment_link_ends[1] + 1 == segment_link_offsets[2]);
} else {
assert(segment_link_offsets[1] + 2 == segment_link_offsets[2]);
}
// the data of segment 0 starts at the first 16-byte boundary after the last link table
assert(align16(segment_link_ends[2] + 2) == segment_data_offsets[0]);
}
/*!
 * Entry point of the link decoder: turn the raw bytes of one object file into a
 * LinkedObjectFile. The version field of the common link header picks the linker
 * (v3, v4 or v5). Any other version trips an assert.
 */
LinkedObjectFile to_linked_object_file(const std::vector<uint8_t>& data, const std::string& name) {
  LinkedObjectFile linked;
  const auto* common = (const LinkHeaderCommon*)&data.at(0);

  // dispatch on the link format version
  switch (common->version) {
    case 3:
      assert(common->type_tag == 0);
      link_v3(linked, data, name);
      break;
    case 4:
      assert(common->type_tag == 0xffffffff);
      link_v4(linked, data, name);
      break;
    case 5:
      link_v5(linked, data, name);
      break;
    default:
      assert(false);
      break;
  }

  return linked;
}

View File

@ -0,0 +1,14 @@
/*!
 * @file LinkedObjectFileCreation.h
 * Create a LinkedObjectFile from raw object file data.
 * This implements a decoder for the GOAL linking format.
 */
#ifndef NEXT_LINKEDOBJECTFILECREATION_H
#define NEXT_LINKEDOBJECTFILECREATION_H
// include everything the declaration below uses, so this header does not depend on what
// LinkedObjectFile.h happens to pull in
#include <cstdint>
#include <string>
#include <vector>
#include "LinkedObjectFile.h"
/*!
 * Decode the link data of a single object file.
 * @param data the complete raw bytes of the object file, starting with its link header
 * @param name the name of the object file
 * @return the object file's words, annotated with the decoded link information
 */
LinkedObjectFile to_linked_object_file(const std::vector<uint8_t>& data, const std::string& name);
#endif  // NEXT_LINKEDOBJECTFILECREATION_H

View File

@ -0,0 +1,33 @@
/*!
* @file LinkedWord.h
* A word (4 bytes), possibly with some linking info.
*/
#ifndef JAK2_DISASSEMBLER_LINKEDWORD_H
#define JAK2_DISASSEMBLER_LINKEDWORD_H
#include <cstdint>
#include <string>
/*!
 * One 32-bit word of an object file, together with the link information attached to it.
 * A freshly constructed word is PLAIN_DATA with no label and no symbol name.
 */
class LinkedWord {
 public:
  // What this word turned out to be once link data was applied.
  enum Kind {
    PLAIN_DATA,  // just plain data
    PTR,         // pointer to a location
    HI_PTR,      // lower 16-bits of this data are the upper 16 bits of a pointer
    LO_PTR,      // lower 16-bits of this data are the lower 16 bits of a pointer
    SYM_PTR,     // this is a pointer to a symbol
    EMPTY_PTR,   // this is a pointer to the empty list
    SYM_OFFSET,  // this is an offset of a symbol in the symbol table
    TYPE_PTR     // this is a pointer to a type
  };

  explicit LinkedWord(uint32_t _data) : data{_data} {}

  Kind kind = PLAIN_DATA;   // link kind, plain data until a link says otherwise
  uint32_t data = 0;        // the raw 32-bit value
  int label_id = -1;        // presumably the id of the label a pointer refers to; -1 = unset
  std::string symbol_name;  // presumably the symbol/type name for the SYM_*/TYPE_PTR kinds
};
#endif // JAK2_DISASSEMBLER_LINKEDWORD_H

View File

@ -0,0 +1,512 @@
/*!
* @file ObjectFileDB.cpp
* A "database" of object files found in DGO files.
* Eliminates duplicate object files, and also assigns unique names to all object files
* (there may be different object files with the same name sometimes)
*/
#include "ObjectFileDB.h"
#include <algorithm>
#include <cstdio>
#include <cstring>
#include <map>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include "LinkedObjectFileCreation.h"
#include "decompiler/config.h"
#include "third-party/minilzo/minilzo.h"
#include "decompiler/util/BinaryReader.h"
#include "decompiler/util/FileIO.h"
#include "decompiler/util/Timer.h"
#include "decompiler/Function/BasicBlocks.h"
/*!
 * Build the unique name of this object file, in the form "<name>-v<version>".
 */
std::string ObjectFileRecord::to_unique_name() const {
  std::string unique = name;
  unique += "-v";
  unique += std::to_string(version);
  return unique;
}
/*!
 * Build an object file DB for the given list of DGOs.
 * Every DGO is read and split into object files right away, then a summary of what was
 * found (and how long it took) is printed.
 * @param _dgos paths of the DGO files to load
 */
ObjectFileDB::ObjectFileDB(const std::vector<std::string>& _dgos) {
  Timer timer;
  printf("- Initializing ObjectFileDB...\n");
  for (const auto& dgo : _dgos) {
    get_objs_from_dgo(dgo);
  }
  printf("ObjectFileDB Initialized:\n");
  // size() is a size_t and the stats are uint32_t: use the matching conversions (%zu / %u).
  // The previous %ld / %d only worked by accident on platforms where the sizes line up.
  printf(" total dgos: %zu\n", _dgos.size());
  printf(" total data: %u bytes\n", stats.total_dgo_bytes);
  printf(" total objs: %u\n", stats.total_obj_files);
  printf(" unique objs: %u\n", stats.unique_obj_files);
  printf(" unique data: %u bytes\n", stats.unique_obj_bytes);
  printf(" total %.1f ms (%.3f MB/sec, %.3f obj/sec)\n", timer.getMs(),
         stats.total_dgo_bytes / ((1u << 20u) * timer.getSeconds()),
         stats.total_obj_files / timer.getSeconds());
  printf("\n");
}
// Header for a DGO file
// The same layout is used twice by get_objs_from_dgo: once at the start of the DGO and once in
// front of every object file inside it. It is read directly from the file bytes, so the field
// order and sizes must not change.
struct DgoHeader {
// DGO header: number of object files in the DGO.
// object header: size of the object file data that follows, in bytes.
uint32_t size;
// null terminated name, the unused remainder is expected to be all zero
char name[60];
};
namespace {
/*!
 * Assert false if the char[] has non-null data after the null terminated string.
 * Used to sanity check the sizes of strings in DGO/object file headers.
 * Also asserts that the string is terminated inside the field at all: the scan for the
 * terminator never reads past str + size, so an unterminated field is reported instead of
 * running off the end of the buffer.
 * @param str  start of the fixed-size character field
 * @param size total size of the field in bytes. Nothing is checked if this is <= 0
 */
void assert_string_empty_after(const char* str, int size) {
  if (size <= 0) {
    return;
  }
  const char* ptr = str;
  const char* const end = str + size;
  // find the terminator, but stay inside the field
  while (ptr < end && *ptr) {
    ptr++;
  }
  // callers go on to use the field as a C string, so it must be terminated
  assert(ptr < end);
  // everything from the terminator to the end of the field must be zero
  while (ptr < end) {
    assert(!*ptr);
    ptr++;
  }
}
}  // namespace
// Chunk size used by compressed ("oZlB") DGOs. A stored chunk size below this is treated as an
// LZO compressed chunk. Anything else is treated as MAX_CHUNK_SIZE bytes of raw data.
constexpr int MAX_CHUNK_SIZE = 0x8000;
/*!
 * Load the objects stored in the given DGO into the ObjectFileDB
 * If the file starts with the "oZlB" magic it is decompressed chunk by chunk first.
 * The (decompressed) DGO is a DgoHeader followed by header.size object files, each of which
 * is another DgoHeader followed by that many bytes of object data.
 * @param filename path of the DGO file. Its base name must match the name in the DGO header.
 */
void ObjectFileDB::get_objs_from_dgo(const std::string& filename) {
  auto dgo_data = read_binary_file(filename);
  stats.total_dgo_bytes += dgo_data.size();

  // Compressed DGOs start with a 4 byte magic. Check the size first so a truncated/empty file
  // is treated as "not compressed" instead of being indexed out of bounds.
  const char jak2_header[] = "oZlB";
  const size_t jak2_header_size = sizeof(jak2_header) - 1;
  const bool is_jak2 = dgo_data.size() >= jak2_header_size &&
                       memcmp(dgo_data.data(), jak2_header, jak2_header_size) == 0;

  if (is_jak2) {
    if (lzo_init() != LZO_E_OK) {
      assert(false);
    }
    BinaryReader compressed_reader(dgo_data);
    // seek past oZlB
    compressed_reader.ffwd(4);
    auto decompressed_size = compressed_reader.read<uint32_t>();
    std::vector<uint8_t> decompressed_data;
    decompressed_data.resize(decompressed_size);
    size_t output_offset = 0;
    // NOTE(review): neither branch below checks that the write stays inside decompressed_data,
    // so a corrupt size field could overflow it - consider bounds checks here.
    while (true) {
      // seek past alignment bytes and read the next chunk size
      uint32_t chunk_size = 0;
      while (!chunk_size) {
        chunk_size = compressed_reader.read<uint32_t>();
      }
      if (chunk_size < MAX_CHUNK_SIZE) {
        lzo_uint bytes_written;
        auto lzo_rv =
            lzo1x_decompress(compressed_reader.here(), chunk_size,
                             decompressed_data.data() + output_offset, &bytes_written, nullptr);
        assert(lzo_rv == LZO_E_OK);
        compressed_reader.ffwd(chunk_size);
        output_offset += bytes_written;
      } else {
        // nope - sometimes chunk_size is bigger than MAX, but we should still use max.
        // assert(chunk_size == MAX_CHUNK_SIZE);
        memcpy(decompressed_data.data() + output_offset, compressed_reader.here(), MAX_CHUNK_SIZE);
        compressed_reader.ffwd(MAX_CHUNK_SIZE);
        output_offset += MAX_CHUNK_SIZE;
      }
      if (output_offset >= decompressed_size)
        break;
      // chunks start on a 4 byte boundary
      while (compressed_reader.get_seek() % 4) {
        compressed_reader.ffwd(1);
      }
    }
    // the compressed bytes are not needed any more: take over the buffer instead of copying it
    dgo_data = std::move(decompressed_data);
  }

  BinaryReader reader(dgo_data);
  auto header = reader.read<DgoHeader>();
  auto dgo_base_name = base_name(filename);
  assert(header.name == dgo_base_name);
  assert_string_empty_after(header.name, 60);
  // get all obj files...
  for (uint32_t i = 0; i < header.size; i++) {
    auto obj_header = reader.read<DgoHeader>();
    assert(reader.bytes_left() >= obj_header.size);
    assert_string_empty_after(obj_header.name, 60);
    add_obj_from_dgo(obj_header.name, reader.here(), obj_header.size, dgo_base_name);
    reader.ffwd(obj_header.size);
  }
  // check we're at the end
  assert(0 == reader.bytes_left());
}
/*!
* Add an object file to the ObjectFileDB
*/
void ObjectFileDB::add_obj_from_dgo(const std::string& obj_name,
uint8_t* obj_data,
uint32_t obj_size,
const std::string& dgo_name) {
stats.total_obj_files++;
auto hash = crc32(obj_data, obj_size);
// first, check to see if we already got it...
for (auto& e : obj_files_by_name[obj_name]) {
if (e.data.size() == obj_size && e.record.hash == hash) {
// already got it!
e.reference_count++;
auto rec = e.record;
obj_files_by_dgo[dgo_name].push_back(rec);
return;
}
}
// nope, have to add a new one.
ObjectFileData data;
data.data.resize(obj_size);
memcpy(data.data.data(), obj_data, obj_size);
data.record.hash = hash;
data.record.name = obj_name;
if (obj_files_by_name[obj_name].empty()) {
// if this is the first time we've seen this object file name, add it in the order.
obj_file_order.push_back(obj_name);
}
data.record.version = obj_files_by_name[obj_name].size();
obj_files_by_dgo[dgo_name].push_back(data.record);
obj_files_by_name[obj_name].emplace_back(std::move(data));
stats.unique_obj_files++;
stats.unique_obj_bytes += obj_size;
}
/*!
 * Produce a text listing of every DGO (sorted by name) and the object files in it,
 * each with its version number.
 */
std::string ObjectFileDB::generate_dgo_listing() {
  // collect and sort the DGO names so the output order is stable
  std::vector<std::string> sorted_dgos;
  for (const auto& entry : obj_files_by_dgo) {
    sorted_dgos.push_back(entry.first);
  }
  std::sort(sorted_dgos.begin(), sorted_dgos.end());

  std::string listing = ";; DGO File Listing\n\n";
  for (const auto& dgo : sorted_dgos) {
    listing += "(\"" + dgo + "\"\n";
    for (const auto& record : obj_files_by_dgo.at(dgo)) {
      listing += " " + record.name + " :version " + std::to_string(record.version) + "\n";
    }
    listing += " )\n\n";
  }
  return listing;
}
/*!
* Process all of the linking data of all objects.
*/
void ObjectFileDB::process_link_data() {
printf("- Processing Link Data...\n");
Timer process_link_timer;
LinkedObjectFile::Stats combined_stats;
for_each_obj([&](ObjectFileData& obj) {
obj.linked_data = to_linked_object_file(obj.data, obj.record.name);
combined_stats.add(obj.linked_data.stats);
});
printf("Processed Link Data:\n");
printf(" code %d bytes\n", combined_stats.total_code_bytes);
printf(" v2 code %d bytes\n", combined_stats.total_v2_code_bytes);
printf(" v2 link data %d bytes\n", combined_stats.total_v2_link_bytes);
printf(" v2 pointers %d\n", combined_stats.total_v2_pointers);
printf(" v2 pointer seeks %d\n", combined_stats.total_v2_pointer_seeks);
printf(" v2 symbols %d\n", combined_stats.total_v2_symbol_count);
printf(" v2 symbol links %d\n", combined_stats.total_v2_symbol_links);
printf(" v3 code %d bytes\n", combined_stats.v3_code_bytes);
printf(" v3 link data %d bytes\n", combined_stats.v3_link_bytes);
printf(" v3 pointers %d\n", combined_stats.v3_pointers);
printf(" split %d\n", combined_stats.v3_split_pointers);
printf(" word %d\n", combined_stats.v3_word_pointers);
printf(" v3 pointer seeks %d\n", combined_stats.v3_pointer_seeks);
printf(" v3 symbols %d\n", combined_stats.v3_symbol_count);
printf(" v3 offset symbol links %d\n", combined_stats.v3_symbol_link_offset);
printf(" v3 word symbol links %d\n", combined_stats.v3_symbol_link_word);
printf(" total %.3f ms\n", process_link_timer.getMs());
printf("\n");
}
/*!
* Process all of the labels generated from linking and give them reasonable names.
*/
void ObjectFileDB::process_labels() {
printf("- Processing Labels...\n");
Timer process_label_timer;
uint32_t total = 0;
for_each_obj([&](ObjectFileData& obj) { total += obj.linked_data.set_ordered_label_names(); });
printf("Processed Labels:\n");
printf(" total %d labels\n", total);
printf(" total %.3f ms\n", process_label_timer.getMs());
printf("\n");
}
/*!
 * Write a word dump (including link annotations) of object files to "<unique name>.txt" in
 * output_dir. With dump_v3_only set, only objects with three segments are written.
 */
void ObjectFileDB::write_object_file_words(const std::string& output_dir, bool dump_v3_only) {
  const char* banner = dump_v3_only ? "- Writing object file dumps (v3 only)...\n"
                                    : "- Writing object file dumps (all)...\n";
  printf("%s", banner);

  Timer timer;
  uint32_t bytes_written = 0;
  uint32_t files_written = 0;

  for_each_obj([&](ObjectFileData& obj) {
    // in v3-only mode, skip everything that doesn't have exactly three segments
    if (dump_v3_only && obj.linked_data.segments != 3) {
      return;
    }
    auto text = obj.linked_data.print_words();
    auto path = combine_path(output_dir, obj.record.to_unique_name() + ".txt");
    bytes_written += text.size();
    write_text_file(path, text);
    files_written++;
  });

  printf("Wrote object file dumps:\n");
  printf(" total %d files\n", files_written);
  printf(" total %.3f MB\n", bytes_written / ((float)(1u << 20u)));
  printf(" total %.3f ms (%.3f MB/sec)\n", timer.getMs(),
         bytes_written / ((1u << 20u) * timer.getSeconds()));
  printf("\n");
}
/*!
 * Write the disassembly of object files to "<unique name>.func" in output_dir.
 * Objects without any functions are only written if disassemble_objects_without_functions
 * is set.
 */
void ObjectFileDB::write_disassembly(const std::string& output_dir,
                                     bool disassemble_objects_without_functions) {
  printf("- Writing functions...\n");
  Timer timer;
  uint32_t bytes_written = 0;
  uint32_t files_written = 0;

  for_each_obj([&](ObjectFileData& obj) {
    const bool wanted =
        obj.linked_data.has_any_functions() || disassemble_objects_without_functions;
    if (!wanted) {
      return;
    }
    auto text = obj.linked_data.print_disassembly();
    auto path = combine_path(output_dir, obj.record.to_unique_name() + ".func");
    bytes_written += text.size();
    write_text_file(path, text);
    files_written++;
  });

  printf("Wrote functions dumps:\n");
  printf(" total %d files\n", files_written);
  printf(" total %.3f MB\n", bytes_written / ((float)(1u << 20u)));
  printf(" total %.3f ms (%.3f MB/sec)\n", timer.getMs(),
         bytes_written / ((1u << 20u) * timer.getSeconds()));
  printf("\n");
}
/*!
* Find code/data zones, identify functions, and disassemble
*/
void ObjectFileDB::find_code() {
printf("- Finding code in object files...\n");
LinkedObjectFile::Stats combined_stats;
Timer timer;
for_each_obj([&](ObjectFileData& obj) {
// printf("fc %s\n", obj.record.to_unique_name().c_str());
obj.linked_data.find_code();
obj.linked_data.find_functions();
obj.linked_data.disassemble_functions();
if (get_config().game_version == 1 || obj.record.to_unique_name() != "effect-control-v0") {
obj.linked_data.process_fp_relative_links();
} else {
printf("skipping process_fp_relative_links in %s\n", obj.record.to_unique_name().c_str());
}
auto& obj_stats = obj.linked_data.stats;
if (obj_stats.code_bytes / 4 > obj_stats.decoded_ops) {
printf("Failed to decode all in %s (%d / %d)\n", obj.record.to_unique_name().c_str(),
obj_stats.decoded_ops, obj_stats.code_bytes / 4);
}
combined_stats.add(obj.linked_data.stats);
});
printf("Found code:\n");
printf(" code %.3f MB\n", combined_stats.code_bytes / (float)(1 << 20));
printf(" data %.3f MB\n", combined_stats.data_bytes / (float)(1 << 20));
printf(" functions: %d\n", combined_stats.function_count);
printf(" fp uses resolved: %d / %d (%.3f %%)\n", combined_stats.n_fp_reg_use_resolved,
combined_stats.n_fp_reg_use,
100.f * (float)combined_stats.n_fp_reg_use_resolved / combined_stats.n_fp_reg_use);
auto total_ops = combined_stats.code_bytes / 4;
printf(" decoded %d / %d (%.3f %%)\n", combined_stats.decoded_ops, total_ops,
100.f * (float)combined_stats.decoded_ops / total_ops);
printf(" total %.3f ms\n", timer.getMs());
printf("\n");
}
/*!
 * Collect the scripts printed by every object file and write them, each group under a
 * banner with the object's unique name, to all_scripts.lisp in output_dir.
 * Doesn't change any state in ObjectFileDB.
 */
void ObjectFileDB::find_and_write_scripts(const std::string& output_dir) {
  printf("- Finding scripts in object files...\n");
  Timer timer;
  std::string combined;

  for_each_obj([&](ObjectFileData& obj) {
    auto scripts = obj.linked_data.print_scripts();
    if (scripts.empty()) {
      return;  // nothing to report for this object
    }
    combined += ";--------------------------------------\n";
    combined += "; ";
    combined += obj.record.to_unique_name();
    combined += "\n";
    combined += ";---------------------------------------\n";
    combined += scripts;
  });

  write_text_file(combine_path(output_dir, "all_scripts.lisp"), combined);

  printf("Found scripts:\n");
  printf(" total %.3f ms\n", timer.getMs());
  printf("\n");
}
/*!
 * Analyze all functions in the database.
 * Step 1: name the top-level function of every three-segment object and use it to find
 *         global function and method definitions.
 * Step 2: flag functions that the config lists as asm, and warn about functions whose
 *         (expected unique) name shows up more than once.
 * Step 3 (only if find_basic_blocks is set in the config): find basic blocks, analyze
 *         prologues and build a control flow graph for every non-asm function, then print
 *         statistics and the names of functions whose cfg could not be fully resolved.
 */
void ObjectFileDB::analyze_functions() {
  printf("- Analyzing Functions...\n");
  Timer timer;
  int total_functions = 0;
  int resolved_cfg_functions = 0;
  const auto& config = get_config();

  // part / whole as a percentage. 0 if there is nothing to divide by.
  const auto percent = [](int part, int whole) {
    return whole ? 100.f * float(part) / float(whole) : 0.f;
  };

  {
    timer.start();
    for_each_obj([&](ObjectFileData& data) {
      if (data.linked_data.segments == 3) {
        // the top level segment should have a single function
        assert(data.linked_data.functions_by_seg.at(2).size() == 1);
        auto& func = data.linked_data.functions_by_seg.at(2).front();
        assert(func.guessed_name.empty());
        func.guessed_name.set_as_top_level();
        func.find_global_function_defs(data.linked_data);
        func.find_method_defs(data.linked_data);
      }
    });

    // check for function uniqueness.
    std::unordered_set<std::string> unique_names;
    std::unordered_map<std::string, std::unordered_set<std::string>> duplicated_functions;

    // first pass: find names that occur more than once, and apply the asm list from the config
    for_each_function([&](Function& func, int segment_id, ObjectFileData& data) {
      (void)segment_id;
      auto name = func.guessed_name.to_string();
      if (func.guessed_name.expected_unique()) {
        // insert() reports if the name was already there
        if (!unique_names.insert(name).second) {
          duplicated_functions[name].insert(data.record.to_unique_name());
        }
      }
      if (config.asm_functions_by_name.find(name) != config.asm_functions_by_name.end()) {
        func.warnings += "flagged as asm by config\n";
        func.suspected_asm = true;
      }
    });

    // second pass: now that all duplicated names are known, warn on every copy of them
    // (the first pass can't do this as it only notices a duplicate on its second sighting)
    for_each_function([&](Function& func, int segment_id, ObjectFileData& data) {
      (void)segment_id;
      if (!func.guessed_name.expected_unique()) {
        return;
      }
      auto dup = duplicated_functions.find(func.guessed_name.to_string());
      if (dup != duplicated_functions.end()) {
        dup->second.insert(data.record.to_unique_name());
        // newline terminated like the other warnings, so later warnings start on a new line
        func.warnings += "this function exists in multiple non-identical object files\n";
      }
    });

    // for(const auto& kv : duplicated_functions) {
    //   printf("Function %s is found in non-identical object files:\n", kv.first.c_str());
    //   for(const auto& obj : kv.second) {
    //     printf(" %s\n", obj.c_str());
    //   }
    // }
  }

  int total_nontrivial_functions = 0;
  int total_resolved_nontrivial_functions = 0;
  int total_named_functions = 0;
  // names of functions with unresolved cfgs, grouped (and sorted) by function length in words
  std::map<int, std::vector<std::string>> unresolved_by_length;

  if (config.find_basic_blocks) {
    timer.start();
    int total_basic_blocks = 0;
    for_each_function([&](Function& func, int segment_id, ObjectFileData& data) {
      auto blocks = find_blocks_in_function(data.linked_data, segment_id, func);
      total_basic_blocks += blocks.size();
      func.basic_blocks = std::move(blocks);

      // Count every function here. The named/resolved counters below include asm functions,
      // so counting only non-asm functions made the printed ratios inconsistent (and able to
      // exceed 100%).
      total_functions++;

      if (!func.suspected_asm) {
        func.analyze_prologue(data.linked_data);
        func.cfg = build_cfg(data.linked_data, segment_id, func);
        if (func.cfg->is_fully_resolved()) {
          resolved_cfg_functions++;
        }
      } else {
        // asm functions get no cfg, they count as having passed this stage
        resolved_cfg_functions++;
      }

      // a function with more than one basic block has a nontrivial cfg
      if (func.basic_blocks.size() > 1 && !func.suspected_asm) {
        total_nontrivial_functions++;
        if (func.cfg->is_fully_resolved()) {
          total_resolved_nontrivial_functions++;
        } else if (!func.guessed_name.empty()) {
          unresolved_by_length[func.end_word - func.start_word].push_back(
              func.guessed_name.to_string());
        }
      }

      if (!func.guessed_name.empty()) {
        total_named_functions++;
      }
    });

    printf("Found %d functions (%d with nontrivial cfgs)\n", total_functions,
           total_nontrivial_functions);
    printf("Named %d/%d functions (%.2f%%)\n", total_named_functions, total_functions,
           percent(total_named_functions, total_functions));
    printf("Found %d basic blocks in %.3f ms\n", total_basic_blocks, timer.getMs());
    printf(" %d/%d functions passed cfg analysis stage (%.2f%%)\n", resolved_cfg_functions,
           total_functions, percent(resolved_cfg_functions, total_functions));
    printf(" %d/%d nontrivial cfg's resolved (%.2f%%)\n", total_resolved_nontrivial_functions,
           total_nontrivial_functions,
           percent(total_resolved_nontrivial_functions, total_nontrivial_functions));

    for (const auto& kv : unresolved_by_length) {
      printf("LEN %d\n", kv.first);
      for (const auto& x : kv.second) {
        printf(" %s\n", x.c_str());
      }
    }
  }
}

View File

@ -0,0 +1,105 @@
/*!
* @file ObjectFileDB.h
* A "database" of object files found in DGO files.
* Eliminates duplicate object files, and also assigns unique names to all object files
* (there may be different object files with the same name sometimes)
*/
#ifndef JAK2_DISASSEMBLER_OBJECTFILEDB_H
#define JAK2_DISASSEMBLER_OBJECTFILEDB_H
#include <cassert>
#include <string>
#include <unordered_map>
#include <vector>
#include "LinkedObjectFile.h"
/*!
 * Identifies one object file. Name and version together are unique within an ObjectFileDB.
 */
struct ObjectFileRecord {
  std::string name;  // name of the object file, as found in the DGO
  int version{-1};   // index among the distinct object files sharing this name, -1 = unset
  uint32_t hash{0};  // crc32 of the object file's data

  // "<name>-v<version>"
  std::string to_unique_name() const;
};
/*!
 * Everything the database stores about one unique object file.
 */
struct ObjectFileData {
  std::vector<uint8_t> data;     // the object file's bytes, exactly as found in the DGO
  LinkedObjectFile linked_data;  // the words of the file with link annotations
  ObjectFileRecord record;       // name, version and hash identifying this file
  uint32_t reference_count{0};   // bumped each time another identical copy is found
};
/*!
 * Database of all unique object files found in a set of DGO files, plus the processing
 * passes that run over them.
 */
class ObjectFileDB {
 public:
  // Load all object files from the given DGO files.
  ObjectFileDB(const std::vector<std::string>& _dgos);

  // Text listing of which object files (and versions) are in which DGO.
  std::string generate_dgo_listing();

  // Processing passes, each one runs over every object file.
  void process_link_data();
  void process_labels();
  void find_code();
  void find_and_write_scripts(const std::string& output_dir);
  void analyze_functions();

  // Debug dumps written to output_dir.
  void write_object_file_words(const std::string& output_dir, bool dump_v3_only);
  void write_disassembly(const std::string& output_dir, bool disassemble_objects_without_functions);

 private:
  void get_objs_from_dgo(const std::string& filename);
  void add_obj_from_dgo(const std::string& obj_name,
                        uint8_t* obj_data,
                        uint32_t obj_size,
                        const std::string& dgo_name);

  /*!
   * Call f(ObjectFileData&) on every object file.
   * Names are visited in the order they were first seen, and for each name all of its
   * versions are visited in order.
   */
  template <typename Func>
  void for_each_obj(Func f) {
    // every known name must be in the order list exactly once
    assert(obj_files_by_name.size() == obj_file_order.size());
    for (const auto& obj_name : obj_file_order) {
      for (auto& version : obj_files_by_name.at(obj_name)) {
        f(version);
      }
    }
  }

  /*!
   * Call f(Function&, int segment, ObjectFileData&) on every function of every object file.
   * Objects are visited as in for_each_obj. Within an object, segments are visited in
   * increasing order and functions in their stored order.
   */
  template <typename Func>
  void for_each_function(Func f) {
    for_each_obj([&](ObjectFileData& data) {
      for (int seg = 0; seg < int(data.linked_data.segments); seg++) {
        for (auto& goal_func : data.linked_data.functions_by_seg.at(seg)) {
          f(goal_func, seg, data);
        }
      }
    });
  }

  // Danger: after adding all object files, we assume that the vector never reallocates.
  // object name -> all distinct versions of that object
  std::unordered_map<std::string, std::vector<ObjectFileData>> obj_files_by_name;
  // dgo name -> records of the objects in it, in file order
  std::unordered_map<std::string, std::vector<ObjectFileRecord>> obj_files_by_dgo;
  // object names, in the order they were first seen
  std::vector<std::string> obj_file_order;

  // counters filled in while loading DGOs
  struct {
    uint32_t total_dgo_bytes = 0;
    uint32_t total_obj_files = 0;
    uint32_t unique_obj_files = 0;
    uint32_t unique_obj_bytes = 0;
  } stats;
};
#endif // JAK2_DISASSEMBLER_OBJECTFILEDB_H

189
decompiler/README.md Normal file
View File

@ -0,0 +1,189 @@
How to use
-----------
Compile (Linux):
```
mkdir build
cd build
cmake ..
make -j
cd ..
```
After compiling:
First create a folder for the output and create a folder for the input. Add all of the CGO/DGO files into the input folder.
```
build/decompiler/decompiler decompiler/config/jak1_ntsc_black_label.jsonc in_folder/ out_folder/
```
Notes
--------
The `config` folder has settings for the disassembly. Currently Jak 2 and Jak 3 are not as well supported as Jak 1.
# Procedure
## ObjectFileDB
The `ObjectFileDB` tracks unique object files. The games have a lot of duplicated object files, and object files with the same names but different contents, so `ObjectFileDB` is used to create a unique name for each unique object file. It generates a file named `dgo.txt` which maps its names to the original name and which DGO files it appears in. The `ObjectFileDB` extracts all object files from a DGO file, decompressing the DGO first if needed. (note: Jak 2 demo DGOs do not decompress properly). Each object file has a number of segments, which the game can load to separate places. Sometimes there is just a single "data" segment, and other times there are three segments:
- `top-level` is executed at the end of the linking process, then discarded and goes in a special temporary heap
- `main` is loaded and linked onto the specified heap
- `debug` is loaded and linked onto the debug heap
## `ObjectFileDB::process_link_data`
This function breaks the object file's data into segments, and processes the link data. The data is stored as a sequence of `LinkedWord`s, which contain extra data from the link. The `LinkedWord`s are stored by segment in a `LinkedObjectFile`, which also contains a list of `Label`s that allow `LinkedWord`s to refer to other `LinkedWord`s. Note that a `Label` can have a byte-offset into a word, which GOAL uses to load non-4-byte-aligned bytes and halfwords, and also to represent a `pair` object.
## `ObjectFileDB::find_code`
This function looks through the `LinkedObjectFile`s and splits each segment into data and code zones.
The only files with code zones are from object files with three segments, and the code always comes first. The end of the code zone is found by looking for the last GOAL `function` object, then finding the end of this object by looking one word past the last `jr ra` instruction. This assumes that the last function in each segment doesn't have an extra inline assembly `jr ra` somewhere in the middle, but functions with multiple `jr ra`'s are extremely rare (and not generated by the GOAL compiler without the use of inline assembly), so this seems like a safe assumption for now.
The code zones are scanned for GOAL `function` types, which are in front of every GOAL function, and used to create `Function`s. Each `Function` is disassembled into EE Instructions, which also adds `Label`s for branch instructions, and can also contain linking data when appropriate. The final step is to look for instructions which use the `fp` register to reference static data, and insert the appropriate `Label`s. GOAL uses the following `fp` relative addressing modes:
- `lw`, `lwc1`, `ld` relative to the `fp` register to load static data.
- `daddiu` to create a pointer to fp-relative data within +/- `2^15` bytes
- Sequence of `ori`, `daddu` to generate a pointer that reaches within `+2^16` bytes
- Sequence of `lui`, `ori`, `daddu` to generate any 32-bit offset from `fp`.
The last two are only found in very large object files, and GOALDIS doesn't handle these.
The `fp` register is set with this sequence. The function prologue only sets `fp` if it is needed in the function.
```
;; goal function call, t9 contains the function address
jalr ra, t9
sll v0, ra, 0
;; example goal function prologue:
daddiu sp, sp, -16
sd ra, 0(sp)
sd fp, 8(sp)
or fp, t9, r0
```
Note: there are a few hacks to avoid generating labels when `fp` is used as a temporary register in inline assembly, such as ignoring stores/loads of `fp` from the stack (the kernel does this to suspend/resume a thread), ignoring `fp` when used with the `PEXTLW` instruction, or totally skipping this step for a single object file in Jak 2 (`effect-control`).
## `ObjectFileDB::process_labels`
This step simply renames labels with `L1`, `L2`, .... It should happen before any custom label naming as it will overwrite all label names.
## `ObjectFileDB::find_and_write_scripts`
Looks for static linked lists and attempts to print them. Doesn't support printing everything, but can print nested lists, strings, numbers, and symbols.
## `ObjectFileDB::write_object_file_words`
Dumps words in each segment like `hexdump`. There's an option to only run this on `v3` object files, which contain code, as opposed to `v2` which are typically large data.
## `ObjectFileDB::write_disassembly`
Like `write_object_file_words`, but code is replaced with disassembly. There's a config option to avoid running this on object files with no functions, as these are usually large data files which are uninteresting to view as a binary dump and slow to dump.
## Basic Block Finding
Look at branch instructions and their destinations to find all basic blocks. Implemented in `find_blocks_in_function` as part of `analyze_functions`. This works for Jak 1, 2 and 3.
## Analyze Functions Prologues and Epilogues
This will help us find stack variables and make sure that the prologue/epilogue are ignored by the statement generation.
A "full" prologue looks like this:
```
daddiu sp, sp, -208
sd ra, 0(sp)
sd fp, 8(sp)
or fp, t9, r0 ;; set fp to the address of this function
sq s3, 128(sp)
sq s4, 144(sp)
sq s5, 160(sp)
sq gp, 176(sp)
swc1 f26, 192(sp)
swc1 f28, 196(sp)
swc1 f30, 200(sp)
```
GOAL will leave out instructions that aren't needed. This prologue is "decoded" into:
```
Total stack usage: 0xd0 bytes
$fp set? : yes
$ra set? : yes
Stack variables : yes, 112 bytes at sp + 16
Saved gprs: gp s5 s4 s3
Saved fprs: f30 f28 f26
```
A similar process is done for the epilogue, and it is checked against the prologue.
The prologue is removed from the first basic block and the epilogue + alignment padding is removed from the last one.
# Documentation of Planned Steps that are not implemented
Currently the focus is to get these working for Jak 1. But it shouldn't be much extra work to support Jak 2/3.
## Guess Function Names (to be implemented)
When possible, we should guess function names. It's not always possible because GOAL supports anonymous lambda functions, like for example:
```
(lambda ((x int) (y int)) (+ x y))
```
which will generate a GOAL `function` object without a name.
But these are pretty uncommon, and the majority of GOAL functions are
- Normal functions, which are stored into a `symbol` with the same name as the function
- Methods, which are stored into the method table of their `type` with the `method-set!` function. Sadly we can't get the name of methods, but we can get their ID (to figure out the inheritance hierarchy) and what type they are defined for.
- State handlers / behaviors (not yet fully understood)
- Virtual state handlers / behaviors (not yet fully understood)
Currently the state/behavior stuff isn't well understood, or used in the early initialization of the game, so name guessing won't worry about this for now.
## Guess Types (to be implemented)
The majority of GOAL types have a compiler-generated `inspect` method which prints their fields. We should detect these methods in the previous function name guessing step, and then read through them to determine the data layout of the type.
## Control Flow Analysis
The basic blocks should be built into a graph and annotated with control flow patterns, like `if`, `cond`, `and`, and various loops. To do this, register liveness will be determined for each instruction.
## Conversion to Statements
Instructions (or sequences of instructions that should not be separated) should be converted into `Statement`s, which represent something like `(add! r1 r2 r3)`. The registers should be mapped to variables, using as many variables as possible, as we don't know at this point if a register will be holding the same GOAL variable at different instructions.
## Type propagation
`Variable`s should get types determined by arguments of the function, which should then be propagated to other `Statement`s in the function, and can then refine the argument types of other functions. This process should be repeated until things stop changing.
## Variable declaration
Variables which are actually the same variable will be merged. The point at which variables are first defined/declared will be determined based on liveness and then expanded to come up with a scope nesting that doesn't cross control flow boundaries.
## Statement -> S-Expression map tree
Due to the simple single pass GOAL compiler design, we build a tree which represents how Statements can be combined to eliminate variables. As an extremely simple example:
```
(set! r1 thing1)
(set! r2 thing2)
(add-int! r4 r2 r3)
(mult-int! r1 r4)
```
can be collapsed to
```
(* thing1 (+ thing2 r3))
```
But
```
(set! r2 thing2)
(add-int! r4 r2 r3)
(set! r1 thing1)
(mult-int! r1 r4)
```
can be collapsed to
```
(let ((temp0 (+ thing2 r3)))
(* thing1 temp0)
)
```
and this difference will actually reflect the difference in how the code was originally written! This is a huge advantage over existing decompilers, which will be unable to tell the subtle difference between the two.
## Macro pattern matching
Lots of GOAL language features are implemented with macros, so once the s-expression nesting is recovered, we can pattern match to undo macros very precisely.

View File

@ -0,0 +1 @@
#include "GoalFunction.h"

View File

@ -0,0 +1,15 @@
#ifndef JAK_DISASSEMBLER_GOALFUNCTION_H
#define JAK_DISASSEMBLER_GOALFUNCTION_H
// Placeholder type for a function entry.
// It currently declares no members; the Kind enum below is commented out.
class GoalFunction {
public:
// enum Kind {
// GLOBAL_FUNCTION,
// ANON_FUNCTION,
// METHOD,
// BEHAVIOR,
// UNKNOWN
// };
};
#endif // JAK_DISASSEMBLER_GOALFUNCTION_H

View File

@ -0,0 +1 @@
#include "GoalSymbol.h"

View File

@ -0,0 +1,38 @@
#ifndef JAK_DISASSEMBLER_GOALSYMBOL_H
#define JAK_DISASSEMBLER_GOALSYMBOL_H
#include <cassert>
#include <string>
#include "TypeSpec.h"
/*!
 * A named symbol which may optionally carry a TypeSpec.
 * A symbol starts without type info unless it is constructed with a TypeSpec.
 */
class GoalSymbol {
 public:
  GoalSymbol() = default;
  explicit GoalSymbol(std::string name) : m_name(std::move(name)) {}
  GoalSymbol(std::string name, TypeSpec ts) : m_name(std::move(name)), m_type(std::move(ts)) {
    m_has_type_info = true;
  }

  /*!
   * True once a type has been provided, either at construction or through set_type.
   */
  bool has_type_info() const { return m_has_type_info; }

  /*!
   * Set the type of this symbol.
   * If a type is already known and the new one differs, the conflict is printed and the
   * assert fires. Setting the same type again is allowed.
   */
  void set_type(TypeSpec ts) {
    if (m_has_type_info) {
      if (ts != m_type) {
        // Terminate the message with a newline so it is emitted as a complete line before the
        // assert aborts (matches the diagnostic in GoalType::set_methods).
        printf("symbol %s %s -> %s\n", m_name.c_str(), m_type.to_string().c_str(),
               ts.to_string().c_str());
        assert(false);
      }
    }
    m_has_type_info = true;
    m_type = std::move(ts);
  }

 private:
  std::string m_name;
  TypeSpec m_type;
  bool m_has_type_info = false;
};
#endif // JAK_DISASSEMBLER_GOALSYMBOL_H

View File

@ -0,0 +1,13 @@
#include "GoalType.h"
/*!
 * Record the number of methods of this type.
 * The first call stores n. Later calls never change the stored count; a later call with a
 * different n only prints a diagnostic.
 */
void GoalType::set_methods(int n) {
  if (!m_method_count_set) {
    m_method_count = n;
    m_method_count_set = true;
    return;
  }

  if (m_method_count == n) {
    return;  // same count as before, nothing to report
  }

  printf("Type %s had %d methods, set_methods tried to change it to %d\n", m_name.c_str(),
         m_method_count, n);
}

View File

@ -0,0 +1,27 @@
#ifndef JAK_DISASSEMBLER_GOALTYPE_H
#define JAK_DISASSEMBLER_GOALTYPE_H
#include <string>
/*!
 * A named type together with the facts known about it so far.
 */
class GoalType {
 public:
  GoalType() = default;
  GoalType(std::string name) : m_name{std::move(name)} {}

  // Defined in GoalType.cpp.
  void set_methods(int n);

  // True once a method count has been recorded.
  bool has_method_count() const { return m_method_count_set; }

  // Value of the m_has_info flag (false by default).
  bool has_info() const { return m_has_info; }

 private:
  std::string m_name;
  bool m_has_info{false};
  bool m_method_count_set{false};
  int m_method_count{-1};
};
#endif // JAK_DISASSEMBLER_GOALTYPE_H

View File

@ -0,0 +1,98 @@
#include "TypeInfo.h"
#include <utility>
// File-local TypeInfo instance; get_type_info() below returns a reference to it.
namespace {
TypeInfo gTypeInfo;
}
/*!
 * Start with "type" registered both as a type and as a symbol.
 * The "type" symbol is created without type info.
 */
TypeInfo::TypeInfo() {
  const std::string builtin = "type";
  m_types[builtin] = GoalType(builtin);
  m_symbols[builtin] = GoalSymbol(builtin);
}
/*!
 * Access the TypeInfo instance defined at the top of this file.
 */
TypeInfo& get_type_info() {
return gTypeInfo;
}
/*!
 * Build a short human-readable report: how many symbols and types are known, and how many of
 * them have type info / info / a method count (as a count and as a percentage).
 */
std::string TypeInfo::get_summary() {
  // symbols
  int total_symbols = 0;
  int syms_with_type_info = 0;
  for (const auto& entry : m_symbols) {
    ++total_symbols;
    syms_with_type_info += entry.second.has_type_info() ? 1 : 0;
  }

  // types
  int total_types = 0;
  int types_with_info = 0;
  int types_with_method_count = 0;
  for (const auto& entry : m_types) {
    ++total_types;
    types_with_info += entry.second.has_info() ? 1 : 0;
    types_with_method_count += entry.second.has_method_count() ? 1 : 0;
  }

  const float symbol_pct = 100.f * float(syms_with_type_info) / float(total_symbols);
  const float info_pct = 100.f * float(types_with_info) / float(total_types);
  const float method_pct = 100.f * float(types_with_method_count) / float(total_types);

  char buffer[1024];
  sprintf(buffer,
          "TypeInfo Summary\n"
          " Total Symbols: %d\n"
          " with type info: %d (%.2f%%)\n"
          " Total Types: %d\n"
          " with info: %d (%.2f%%)\n"
          " with method count: %d (%.2f%%)\n",
          total_symbols, syms_with_type_info, symbol_pct, total_types, types_with_info, info_pct,
          types_with_method_count, method_pct);
  return std::string(buffer);
}
/*!
 * Tell TypeInfo that a symbol with this name exists, without saying anything about its type.
 * A symbol that is already known is left untouched, so existing type info is kept.
 */
void TypeInfo::inform_symbol_with_no_type_info(const std::string& name) {
  const bool already_known = m_symbols.count(name) != 0;
  if (already_known) {
    return;
  }
  // first time we see this name: add it with no type info
  m_symbols.emplace(name, GoalSymbol(name));
}
void TypeInfo::inform_symbol(const std::string &name, TypeSpec type) {
inform_symbol_with_no_type_info(name);
m_symbols.at(name).set_type(std::move(type));
}
/*!
 * Tell TypeInfo that a type with this name exists.
 * Adds the type if it is new, then registers a symbol of the same name whose type is "type".
 */
void TypeInfo::inform_type(const std::string& name) {
  const bool is_new_type = m_types.count(name) == 0;
  if (is_new_type) {
    m_types.emplace(name, GoalType(name));
  }
  inform_symbol(name, TypeSpec("type"));
}
void TypeInfo::inform_type_method_count(const std::string& name, int methods) {
// create type and symbol
inform_type(name);
m_types.at(name).set_methods(methods);
}
/*!
 * Debug helper: build the text of a C array declaration listing every known symbol name,
 * e.g. const char* all_syms[2] = {"a","b"};
 * Names are emitted in the iteration order of the symbol map.
 */
std::string TypeInfo::get_all_symbols_debug() {
  std::string result = "const char* all_syms[" + std::to_string(m_symbols.size()) + "] = {";
  // Put the comma before every entry except the first. Removing a trailing character
  // afterwards would eat the opening brace when there are no symbols at all.
  bool first = true;
  for (const auto& x : m_symbols) {
    if (!first) {
      result += ",";
    }
    result += "\"" + x.first + "\"";
    first = false;
  }
  return result + "};";
}

View File

@ -0,0 +1,30 @@
#ifndef JAK_DISASSEMBLER_TYPEINFO_H
#define JAK_DISASSEMBLER_TYPEINFO_H
#include <unordered_map>
#include "GoalType.h"
#include "GoalFunction.h"
#include "GoalSymbol.h"
// Collects the types and symbols that have been reported so far.
class TypeInfo {
public:
TypeInfo();
// Register a symbol together with its type.
void inform_symbol(const std::string& name, TypeSpec type);
// Register a symbol without providing type information.
void inform_symbol_with_no_type_info(const std::string& name);
// Register a type by name.
void inform_type(const std::string& name);
// Register a type by name and record its method count.
void inform_type_method_count(const std::string& name, int methods);
// Human-readable counts of known symbols and types.
std::string get_summary();
// Text of a C array declaration listing all known symbol names.
std::string get_all_symbols_debug();
private:
// All maps are keyed by name.
std::unordered_map<std::string, GoalType> m_types;
std::unordered_map<std::string, GoalFunction> m_global_functions;
std::unordered_map<std::string, GoalSymbol> m_symbols;
};
// Access the TypeInfo instance owned by TypeInfo.cpp.
TypeInfo& get_type_info();
// NOTE(review): no definition of init_type_info is visible in TypeInfo.cpp - confirm it is
// defined somewhere or remove the declaration.
void init_type_info();
#endif // JAK_DISASSEMBLER_TYPEINFO_H

View File

@ -0,0 +1,51 @@
#include "TypeSpec.h"
/*!
 * Text form of the type: just the base type when there are no arguments, otherwise
 * "(base arg0 arg1 ...)" with each argument printed recursively.
 */
std::string TypeSpec::to_string() const {
  if (m_args.empty()) {
    return m_base_type;
  }

  std::string text = "(" + m_base_type;
  for (const auto& arg : m_args) {
    text += " " + arg.to_string();
  }
  text += ")";
  return text;
}
std::shared_ptr<Form> TypeSpec::to_form() const {
if (m_args.empty()) {
return toForm(m_base_type);
} else {
std::vector<std::shared_ptr<Form>> all;
all.push_back(toForm(m_base_type));
for (const auto& x : m_args) {
all.push_back(x.to_form());
}
return buildList(all);
}
}
/*!
 * Two TypeSpecs are equal when the base types match and the argument lists have the same
 * length and are element-wise equal (compared recursively).
 */
bool TypeSpec::operator==(const TypeSpec& other) const {
  // vector comparison checks the sizes first, then compares elements with this operator
  return m_base_type == other.m_base_type && m_args == other.m_args;
}
/*!
 * Negation of operator==.
 */
bool TypeSpec::operator!=(const TypeSpec& other) const {
return !(*this == other);
}

View File

@ -0,0 +1,25 @@
#ifndef JAK_DISASSEMBLER_TYPESPEC_H
#define JAK_DISASSEMBLER_TYPESPEC_H
#include <string>
#include <vector>
#include "decompiler/util/LispPrint.h"
// A type written as a base type name plus an optional list of argument TypeSpecs.
class TypeSpec {
public:
TypeSpec() = default;
// Type with no arguments.
explicit TypeSpec(std::string base_type) : m_base_type(std::move(base_type)) { }
// Type with arguments.
TypeSpec(std::string base_type, std::vector<TypeSpec> args) : m_base_type(std::move(base_type)), m_args(std::move(args)) { }
// Text form; defined in TypeSpec.cpp.
std::string to_string() const;
// Form version; defined in TypeSpec.cpp.
std::shared_ptr<Form> to_form() const;
bool operator==(const TypeSpec& other) const;
bool operator!=(const TypeSpec& other) const;
private:
std::string m_base_type;
std::vector<TypeSpec> m_args;
};
#endif // JAK_DISASSEMBLER_TYPESPEC_H

32
decompiler/config.cpp Normal file
View File

@ -0,0 +1,32 @@
#include "config.h"
#include "third-party/json.hpp"
#include "util/FileIO.h"
// The configuration instance; filled in by set_config() and returned by get_config().
Config gConfig;
/*!
 * Access the configuration instance.
 */
Config& get_config() {
return gConfig;
}
/*!
 * Load the configuration from a JSON file (comments in the file are allowed) into the
 * configuration instance returned by get_config().
 * All keys are required and looked up with at(), which throws if one is missing, except
 * "asm_functions_by_name", which is optional: not every config file provides it.
 */
void set_config(const std::string& path_to_config_file) {
  auto config_str = read_text_file(path_to_config_file);
  // to ignore comments in json, which may be useful
  auto cfg = nlohmann::json::parse(config_str, nullptr, true, true);

  gConfig.game_version = cfg.at("game_version").get<int>();
  gConfig.dgo_names = cfg.at("dgo_names").get<std::vector<std::string>>();
  gConfig.write_disassembly = cfg.at("write_disassembly").get<bool>();
  gConfig.write_hexdump = cfg.at("write_hexdump").get<bool>();
  gConfig.write_scripts = cfg.at("write_scripts").get<bool>();
  gConfig.write_hexdump_on_v3_only = cfg.at("write_hexdump_on_v3_only").get<bool>();
  gConfig.disassemble_objects_without_functions =
      cfg.at("disassemble_objects_without_functions").get<bool>();
  gConfig.find_basic_blocks = cfg.at("find_basic_blocks").get<bool>();
  gConfig.write_hex_near_instructions = cfg.at("write_hex_near_instructions").get<bool>();

  // Like every other field, the set reflects only the file being loaded now.
  gConfig.asm_functions_by_name.clear();
  // Optional key: a config without it simply has no functions marked as assembly.
  auto asm_functions = cfg.find("asm_functions_by_name");
  if (asm_functions != cfg.end()) {
    for (const auto& x : asm_functions->get<std::vector<std::string>>()) {
      gConfig.asm_functions_by_name.insert(x);
    }
  }
}

25
decompiler/config.h Normal file
View File

@ -0,0 +1,25 @@
#ifndef JAK2_DISASSEMBLER_CONFIG_H
#define JAK2_DISASSEMBLER_CONFIG_H
#include <string>
#include <vector>
#include <unordered_set>
// Settings read from the JSON config file by set_config(). Each field is read from the
// config key of the same name.
struct Config {
int game_version = -1;
// DGO/CGO file names; main() joins each one onto the input folder.
std::vector<std::string> dgo_names;
bool write_disassembly = false;
bool write_hexdump = false;
bool write_scripts = false;
bool write_hexdump_on_v3_only = false;
bool disassemble_objects_without_functions = false;
bool find_basic_blocks = false;
bool write_hex_near_instructions = false;
std::unordered_set<std::string> asm_functions_by_name;
// ...
};
// Access the configuration instance.
Config& get_config();
// Load the configuration instance from the given JSON file.
void set_config(const std::string& path_to_config_file);
#endif // JAK2_DISASSEMBLER_CONFIG_H

View File

@ -0,0 +1,37 @@
{
"game_version":1,
// the order here matters. KERNEL and GAME should go first
"dgo_names":["CGO/KERNEL.CGO"
, "CGO/GAME.CGO", "CGO/ENGINE.CGO"
, "CGO/ART.CGO", "DGO/BEA.DGO", "DGO/CIT.DGO", "CGO/COMMON.CGO", "DGO/DAR.DGO", "DGO/DEM.DGO",
"DGO/FIN.DGO", "DGO/INT.DGO", "DGO/JUB.DGO", "DGO/JUN.DGO", "CGO/JUNGLE.CGO", "CGO/L1.CGO", "DGO/FIC.DGO",
"DGO/LAV.DGO", "DGO/MAI.DGO", "CGO/MAINCAVE.CGO", "DGO/MIS.DGO", "DGO/OGR.DGO", "CGO/RACERP.CGO", "DGO/ROB.DGO", "DGO/ROL.DGO",
"DGO/SNO.DGO", "DGO/SUB.DGO", "DGO/SUN.DGO", "CGO/SUNKEN.CGO", "DGO/SWA.DGO", "DGO/TIT.DGO", "DGO/TRA.DGO", "DGO/VI1.DGO",
"DGO/VI2.DGO", "DGO/VI3.DGO", "CGO/VILLAGEP.CGO", "CGO/WATER-AN.CGO"],
"write_disassembly":true,
"write_hex_near_instructions":false,
// if false, skips disassembling object files without functions, as these are usually large and not interesting yet.
"disassemble_objects_without_functions":false,
// to write out data of each object file
"write_hexdump":false,
// to write out hexdump on the v3 only, to avoid the huge level data files
"write_hexdump_on_v3_only":true,
// to write out "scripts", which are currently just all the linked lists found
"write_scripts":false,
// Experimental Stuff
"find_basic_blocks":true,
"asm_functions_by_name":[
// gcommon
"ash", "abs", "min", "max", "collide-do-primitives", "draw-bones-check-longest-edge-asm",
"sp-launch-particles-var", "(method 15 collide-shape-prim-mesh)", "(method 15 collide-shape-prim-sphere)",
"(method 45 collide-shape)", "cam-layout-save-cam-trans", "kernel-copy-function", "dma-sync-hang", "generic-no-light-dproc", "dma-sync-fast", "bsp-camera-asm",
"generic-none-dma-wait", "unpack-comp-rle", "level-remap-texture", "(method 10 collide-edge-hold-list)"
]
}

View File

@ -0,0 +1,43 @@
{
"game_version":2,
"dgo_names":["ART.CGO", "ATE.DGO", "ATO.DGO", "CAB.DGO", "CAP.DGO", "CAS.DGO", "CASCITY.DGO", "CASEXT.DGO",
"CFA.DGO", "CFB.DGO", "CGA.DGO", "CGB.DGO", "CGC.DGO", "CIA.DGO", "CIB.DGO", "CMA.DGO",
"CMB.DGO", "COA.DGO", "COB.DGO", "COMMON.CGO", "CPA.DGO", "CPO.DGO", "CTA.DGO", "CTB.DGO",
"CTC.DGO", "CTYASHA.DGO", "CTYKORA.DGO", "CWI.DGO", "D3A.DGO", "D3B.DGO", "DEMO.DGO", "DG1.DGO",
"DMI.DGO", "DRB.DGO", "DRI.DGO", "DRILLMTN.DGO", "ENGINE.CGO", "FDA.DGO", "FDB.DGO", "FEA.DGO",
"FEB.DGO", "FOB.DGO", "FOR.DGO", "FORDUMPC.DGO", "FORDUMPD.DGO", "FRA.DGO", "FRB.DGO", "GAME.CGO",
"GARAGE.DGO", "GGA.DGO", "HALFPIPE.DGO", "HIDEOUT.DGO", "HIPHOG.DGO", "INTROCST.DGO", "KERNEL.CGO", "KIOSK.DGO",
"LASHGRD.DGO", "LASHTHRN.DGO", "LBBUSH.DGO", "LBOMBBOT.DGO", "LBRNERMK.DGO", "LCGUARD.DGO", "LCITYLOW.DGO", "LDJAKBRN.DGO",
"LERBRNGD.DGO", "LERLCHAL.DGO", "LERLTESS.DGO", "LERROL.DGO", "LGARCSTA.DGO", "LGUARD.DGO", "LHELLDOG.DGO", "LHIPOUT.DGO",
"LINTCSTB.DGO", "LJAKDAX.DGO", "LJKDXASH.DGO", "LKEIRIFT.DGO", "LKIDDOGE.DGO", "LMEETBRT.DGO", "LOUTCSTB.DGO", "LPACKAGE.DGO",
"LPORTRUN.DGO", "LPOWER.DGO", "LPROTECT.DGO", "LPRSNCST.DGO", "LPRTRACE.DGO", "LRACEBB.DGO", "LRACEBF.DGO", "LRACECB.DGO",
"LRACECF.DGO", "LRACEDB.DGO", "LRACEDF.DGO", "LRACELIT.DGO", "LSACK.DGO", "LSAMERGD.DGO", "LSHUTTLE.DGO", "LSMYSBRT.DGO",
"LTENTOB.DGO", "LTENTOUT.DGO", "LTESS.DGO", "LTHRNOUT.DGO", "LTRNKRKD.DGO", "LTRNTESS.DGO", "LTRNYSAM.DGO", "LWHACK.DGO",
"LWIDEA.DGO", "LWIDEB.DGO", "LWIDEC.DGO", "LWIDESTA.DGO", "LYSAMSAM.DGO", "LYSKDCD.DGO", "MCN.DGO", "MTN.DGO",
"MTX.DGO", "NEB.DGO", "NES.DGO", "NESTT.DGO", "ONINTENT.DGO", "ORACLE.DGO", "OUTROCST.DGO", "PAC.DGO",
"PAE.DGO", "PALBOSS.DGO", "PALOUT.DGO", "PAR.DGO", "PAS.DGO", "PORTWALL.DGO", "PRI.DGO", "RUI.DGO",
"SAG.DGO", "SEB.DGO", "SEW.DGO", "SKA.DGO", "STA.DGO", "STADBLMP.DGO", "STB.DGO", "STC.DGO",
"STD.DGO", "STR.DGO", "SWB.DGO", "SWE.DGO", "TBO.DGO", "THR.DGO", "TITLE.DGO", "TOA.DGO",
"TOB.DGO", "TOC.DGO", "TOD.DGO", "TOE.DGO", "TOMBEXT.DGO", "UNB.DGO", "UND.DGO", "VI1.DGO",
"VIN.DGO"],
// to write out disassembled functions in .func files
"write_disassembly":true,
"write_hex_near_instructions":false,
// if false, skips disassembling object files without functions, as these are usually large and not interesting yet.
"disassemble_objects_without_functions":false,
// to write out data of each object file
"write_hexdump":false,
// to write out hexdump on the v3 only, to avoid the huge level data files
"write_hexdump_on_v3_only":true,
// to write out "scripts", which are currently just all the linked lists found
"write_scripts":true,
// Experimental Stuff
"find_basic_blocks":true
}

View File

@ -0,0 +1,56 @@
{
"game_version":3,
"dgo_names":["ARENACST.DGO", "ART.CGO", "CFA.DGO", "CFB.DGO", "CGB.DGO", "CIA.DGO", "CIB.DGO", "CITYCAST.DGO",
"COMBA.DGO", "COMBB.DGO", "COMBC.DGO", "COMBD.DGO", "COMBE.DGO", "COMBN.DGO", "COMBX.DGO", "COMMON.CGO",
"CPO.DGO", "CTA.DGO", "CTB.DGO", "CTC.DGO", "CTYCARA.DGO", "CTYCARB.DGO", "CTYCARC.DGO", "CTYCARKG.DGO",
"CTYPEPA.DGO", "CTYPEPB.DGO", "CTYPEPC.DGO", "CTYPESA.DGO", "CTYPESB.DGO", "CTYPESC.DGO", "CWI.DGO", "DESA.DGO",
"DESB.DGO", "DESBATTL.DGO", "DESBCST.DGO", "DESBOSS1.DGO", "DESBOSS2.DGO", "DESC.DGO", "DESCHASE.DGO", "DESD.DGO",
"DESE.DGO", "DESERROL.DGO", "DESF.DGO", "DESG.DGO", "DESH.DGO", "DESHOVER.DGO", "DESHUNT.DGO", "DESINTER.DGO",
"DESJUMP.DGO", "DESLIZ.DGO", "DESOASIS.DGO", "DESRACE1.DGO", "DESRACE2.DGO", "DESRALLY.DGO", "DESRESC.DGO", "DESRESCC.DGO",
"DESRESCG.DGO", "DESTRACK.DGO", "DESW.DGO", "DST.DGO", "ENGINE.CGO", "FACB.DGO", "FACC.DGO", "FACD.DGO",
"FACTORYA.DGO", "FREECAST.DGO", "FREEHQ.DGO", "FRSTA.DGO", "FRSTB.DGO", "FRSTX.DGO", "GAME.CGO", "GGA.DGO",
"GRIDCST.DGO", "GUNGAME1.DGO", "GUNGAME2.DGO", "HALFPIPE.DGO", "HGA.DGO", "HGB.DGO", "HHG.DGO", "INTPALRF.DGO",
"INTROCST.DGO", "INTTITLE.DGO", "IPF.DGO", "KERNEL.CGO", "LASHELIN.DGO", "LBBRING1.DGO", "LBBRING2.DGO", "LBBRING3.DGO",
"LBBRING4.DGO", "LBBRING5.DGO", "LBBRING6.DGO", "LBBSDRP1.DGO", "LBBSDRP2.DGO", "LBBSDRP3.DGO", "LBBSPID.DGO", "LBBSPIRT.DGO",
"LBBSPRT2.DGO", "LBBSPRT3.DGO", "LBBTCHA1.DGO", "LBBTCHA2.DGO", "LBBTCHA3.DGO", "LBIPED.DGO", "LBLOWCST.DGO", "LBLOWTKG.DGO",
"LBLOWTMH.DGO", "LBOMBBOT.DGO", "LCITYSML.DGO", "LCTYASS.DGO", "LCTYBLOW.DGO", "LCTYDEST.DGO", "LCTYHIJK.DGO", "LCTYPALT.DGO",
"LCTYPATK.DGO", "LCTYPROT.DGO", "LCTYSNPR.DGO", "LDAMKLEV.DGO", "LDAMPECK.DGO", "LDAMPKSM.DGO", "LDAMSIG.DGO", "LDAX.DGO",
"LDESGCST.DGO", "LDMPCKGN.DGO", "LERROL.DGO", "LFACB.DGO", "LFACCAR.DGO", "LFACCITY.DGO", "LFACO.DGO", "LFACRM1.DGO",
"LFACRM2.DGO", "LFACTORY.DGO", "LFORM.DGO", "LFORP.DGO", "LFORRING.DGO", "LFREEOUT.DGO", "LGUNNORM.DGO", "LGUNRNC.DGO",
"LJAK.DGO", "LJAKC.DGO", "LJAKCKLV.DGO", "LJAKKLEV.DGO", "LJAKNDAX.DGO", "LJAKSIG.DGO", "LJINX.DGO", "LJKCDMKL.DGO",
"LJKDMPK.DGO", "LJKDXVIN.DGO", "LJKFEET.DGO", "LJNDKLEV.DGO", "LKEIRA.DGO", "LKLEEVER.DGO", "LMECH.DGO", "LMHCA.DGO",
"LMHCB.DGO", "LNSTCST.DGO", "LNSTOA.DGO", "LNSTOBB.DGO", "LNSTOBC.DGO", "LONINSIM.DGO", "LOUTRO.DGO", "LOUTRO2.DGO",
"LOUTRO3.DGO", "LPATK.DGO", "LPATKCS.DGO", "LPRECC.DGO", "LPRENME.DGO", "LPTRL.DGO", "LSAMOS.DGO", "LSEEMWCA.DGO",
"LSIG.DGO", "LSIGJAKC.DGO", "LSIGKLV.DGO", "LSNKWHLS.DGO", "LTNFXHIP.DGO", "LTNJXHIP.DGO", "LTORN.DGO", "LTORNJNX.DGO",
"LTORNSAM.DGO", "LTOWA.DGO", "LTOWB.DGO", "LTOWCITY.DGO", "LTRTWHLS.DGO", "LVINCST.DGO", "LWASBBV.DGO", "LWASSIG.DGO",
"LWLANDM.DGO", "LWSTDPCK.DGO", "MHCA.DGO", "MHCB.DGO", "MHCTYCST.DGO", "MIA.DGO", "MIB.DGO", "MIC.DGO",
"MINED.DGO", "MINEE.DGO", "MUSEUM.DGO", "MUSEUM2.DGO", "MUSEUM3.DGO", "MUSEUM3B.DGO", "MUSEUM4.DGO", "MUSEUM4B.DGO",
"NSA.DGO", "NSB.DGO", "OASISCST.DGO", "ONINTENT.DGO", "OUTCAST3.DGO", "OUTROCST.DGO", "POWERGD.DGO", "PRECA.DGO",
"PRECB.DGO", "PRECC.DGO", "PRECD.DGO", "RAILA.DGO", "RAILB.DGO", "RAILB2.DGO", "RAILC.DGO", "RAILCST.DGO",
"RAILD.DGO", "RAILE.DGO", "RAILF.DGO", "RAILX.DGO", "RBCT.DGO", "RUBA.DGO", "RUBA2.DGO", "RUBB.DGO",
"RUBC.DGO", "SEA.DGO", "SEB.DGO", "SEC.DGO", "SED.DGO", "SEE.DGO", "SEF.DGO", "SEG.DGO",
"SEH.DGO", "SEI.DGO", "SEJ.DGO", "SEK.DGO", "SEL.DGO", "SEM.DGO", "SEN.DGO", "SEO.DGO",
"SLUMBSET.DGO", "STA.DGO", "STAA.DGO", "STB.DGO", "TEMA.DGO", "TEMB.DGO", "TEMC.DGO", "TEMD.DGO",
"TEMP.DGO", "TEMPLEE.DGO", "TEMX.DGO", "TITLE.DGO", "TOWB.DGO", "TOWERA.DGO", "TOWERC.DGO", "TOWERCST.DGO",
"VIN.DGO", "VOCA.DGO", "VOCX.DGO", "WARPCAST.DGO", "WASALL.DGO", "WASCAST.DGO", "WASCHASE.DGO", "WASDEFEN.DGO",
"WASLEAPR.DGO", "WASPALA.DGO", "WASPGAME.DGO", "WASSEEM.DGO", "WASSTADA.DGO", "WASSTADB.DGO", "WASSTADC.DGO", "WCA.DGO",
"WCASEEM.DGO", "WCB.DGO", "WIN.DGO", "WSD.DGO", "WWD.DGO"],
// to write out disassembled functions in .func files
"write_disassembly":true,
"write_hex_near_instructions":false,
// if false, skips disassembling object files without functions, as these are usually large and not interesting yet.
"disassemble_objects_without_functions":false,
// to write out data of each object file
"write_hexdump":false,
// to write out hexdump on the v3 only, to avoid the huge level data files
"write_hexdump_on_v3_only":true,
// to write out "scripts", which are currently just all the linked lists found
"write_scripts":true,
// Experimental Stuff
"find_basic_blocks":true
}

53
decompiler/main.cpp Normal file
View File

@ -0,0 +1,53 @@
#include <cstdio>
#include <string>
#include <vector>
#include "ObjectFile/ObjectFileDB.h"
#include "config.h"
#include "util/FileIO.h"
#include "TypeSystem/TypeInfo.h"
int main(int argc, char** argv) {
printf("Jak Disassembler\n");
init_crc();
init_opcode_info();
if (argc != 4) {
printf("usage: jak_disassembler <config_file> <in_folder> <out_folder>\n");
return 1;
}
set_config(argv[1]);
std::string in_folder = argv[2];
std::string out_folder = argv[3];
std::vector<std::string> dgos;
for (const auto& dgo_name : get_config().dgo_names) {
dgos.push_back(combine_path(in_folder, dgo_name));
}
ObjectFileDB db(dgos);
write_text_file(combine_path(out_folder, "dgo.txt"), db.generate_dgo_listing());
db.process_link_data();
db.find_code();
db.process_labels();
if (get_config().write_scripts) {
db.find_and_write_scripts(out_folder);
}
if (get_config().write_hexdump) {
db.write_object_file_words(out_folder, get_config().write_hexdump_on_v3_only);
}
db.analyze_functions();
if (get_config().write_disassembly) {
db.write_disassembly(out_folder, get_config().disassemble_objects_without_functions);
}
printf("%s\n", get_type_info().get_summary().c_str());
// printf("%d\n", InstructionKind::EE_OP_MAX);
// printf("%s\n", get_type_info().get_all_symbols_debug().c_str());
return 0;
}

View File

@ -0,0 +1,28 @@
#!/usr/bin/env python3
import argparse
import glob
import os
# Create a dgo_names = ["...."] json config entry text for a folder of DGOs.
def main():
    """Print a "dgo_names":[...] config entry for every *.*GO file in a folder.

    The folder is taken from the command line. File names are sorted and written
    eight per line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(dest='folder', help='folder containing dgos')
    args = parser.parse_args()

    files = sorted(os.path.basename(x) for x in glob.glob(os.path.join(args.folder, "*.*GO")))

    # Join the names instead of appending ", " after each one and trimming the last two
    # characters: trimming removes the wrong characters when the folder has no matching
    # files or when the number of files is a multiple of eight.
    names_per_line = 8
    lines = [
        ", ".join("\"" + name + "\"" for name in files[start:start + names_per_line])
        for start in range(0, len(files), names_per_line)
    ]
    dgo_names = "\"dgo_names\":[" + ", \n ".join(lines) + "]\n"
    print(dgo_names)


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,48 @@
#ifndef JAK_V2_BINARYREADER_H
#define JAK_V2_BINARYREADER_H
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>
/*!
 * Sequential reader over a byte buffer that it does not own.
 * The buffer must stay alive and unchanged in size for as long as the reader is used.
 */
class BinaryReader {
 public:
  BinaryReader(uint8_t* _buffer, uint32_t _size) : buffer(_buffer), size(_size) {}

  explicit BinaryReader(std::vector<uint8_t>& _buffer)
      : buffer(_buffer.data()), size(static_cast<uint32_t>(_buffer.size())) {}

  /*!
   * Read a T from the current position and advance past it.
   * T is copied out byte by byte, so the current position does not need to be aligned for T.
   * T must be default constructible and safe to copy with memcpy.
   */
  template <typename T>
  T read() {
    assert(seek + sizeof(T) <= size);
    T obj;
    // memcpy instead of dereferencing a casted pointer: the position inside the byte
    // buffer is generally not suitably aligned for T.
    std::memcpy(&obj, buffer + seek, sizeof(T));
    seek += sizeof(T);
    return obj;
  }

  /*!
   * Move the current position by amount bytes. Asserts that it stays inside the buffer.
   */
  void ffwd(int amount) {
    seek += amount;
    assert(seek <= size);
  }

  /*!
   * Number of bytes between the current position and the end of the buffer.
   */
  uint32_t bytes_left() const { return size - seek; }

  /*!
   * Pointer to the byte at the current position.
   */
  uint8_t* here() { return buffer + seek; }

  /*!
   * Current position, in bytes from the start of the buffer.
   */
  uint32_t get_seek() const { return seek; }

 private:
  uint8_t* buffer;
  uint32_t size;
  uint32_t seek = 0;
};
#endif //JAK_V2_BINARYREADER_H

View File

@ -0,0 +1,82 @@
#include "FileIO.h"
#include <fstream>
#include <sstream>
#include <cassert>
/*!
 * Read a whole file into a string.
 * Throws std::runtime_error if the file cannot be opened, like read_binary_file does,
 * instead of silently returning an empty string.
 */
std::string read_text_file(const std::string& path) {
  std::ifstream file(path);
  if (!file.is_open()) {
    throw std::runtime_error("File " + path + " cannot be opened");
  }
  std::stringstream ss;
  ss << file.rdbuf();
  return ss.str();
}
/*!
 * Join a parent path and a child path with a single '/' between them.
 * No normalization is done: the two parts are used exactly as given.
 */
std::string combine_path(const std::string& parent, const std::string& child) {
  std::string joined = parent;
  joined += "/";
  joined += child;
  return joined;
}
/*!
 * Read a whole file into a byte vector.
 * Throws std::runtime_error if the file cannot be opened or read. An empty file gives an
 * empty vector. The file is closed on every path that gets past the size query.
 */
std::vector<uint8_t> read_binary_file(const std::string& filename) {
  auto fp = fopen(filename.c_str(), "rb");
  if (!fp) throw std::runtime_error("File " + filename + " cannot be opened");

  // find the size by seeking to the end
  fseek(fp, 0, SEEK_END);
  auto len = ftell(fp);
  rewind(fp);
  if (len < 0) {
    fclose(fp);
    throw std::runtime_error("File " + filename + " cannot be read");
  }

  std::vector<uint8_t> data;
  data.resize(len);
  // fread reports 0 items for a zero-length request, so only ask when there is data.
  const bool ok = data.empty() || fread(data.data(), data.size(), 1, fp) == 1;
  fclose(fp);
  if (!ok) {
    throw std::runtime_error("File " + filename + " cannot be read");
  }
  return data;
}
/*!
 * Strip the directory part of a path: returns what follows the last '/' that is not the
 * final character of the string. A '/' in the final position is never treated as a
 * separator, so "dir/" is returned unchanged. The input must not be empty.
 */
std::string base_name(const std::string& filename) {
  assert(!filename.empty());
  if (filename.size() < 2) {
    return filename;  // a single character has nothing to strip
  }

  // search backwards, starting just before the final character
  const size_t slash = filename.rfind('/', filename.size() - 2);
  if (slash == std::string::npos) {
    return filename;
  }
  return filename.substr(slash + 1);
}
// Set by init_crc(); crc32() asserts on it.
static bool sInitCrc = false;
// One table entry per possible byte value, filled in by init_crc().
static uint32_t crc_table[0x100];

/*!
 * Fill the CRC lookup table. Must be called before crc32().
 */
void init_crc() {
  constexpr uint32_t kPolynomial = 0x04c11db7u;
  constexpr uint32_t kTopBit = 0x80000000u;

  for (uint32_t byte = 0; byte < 0x100; byte++) {
    uint32_t remainder = byte << 24u;
    for (int bit = 0; bit < 8; bit++) {
      const bool top_bit_set = (remainder & kTopBit) != 0;
      remainder <<= 1u;
      if (top_bit_set) {
        remainder ^= kPolynomial;
      }
    }
    crc_table[byte] = remainder;
  }
  sInitCrc = true;
}

/*!
 * Checksum of size bytes starting at data, using the table built by init_crc().
 * The running value starts at 0 and the result is its bitwise complement.
 */
uint32_t crc32(const uint8_t* data, size_t size) {
  assert(sInitCrc);
  uint32_t crc = 0;
  for (size_t idx = 0; idx < size; idx++) {
    crc = crc_table[crc >> 24u] ^ ((crc << 8u) | data[idx]);
  }
  return ~crc;
}

/*!
 * Checksum of a whole byte vector.
 */
uint32_t crc32(const std::vector<uint8_t>& data) {
  return crc32(data.data(), data.size());
}
/*!
 * Write text to a file, followed by a newline. An existing file is overwritten.
 * If the file cannot be opened, a message is printed and std::runtime_error is thrown.
 */
void write_text_file(const std::string& file_name, const std::string& text) {
  FILE* out = fopen(file_name.c_str(), "w");
  if (out == nullptr) {
    printf("Failed to fopen %s\n", file_name.c_str());
    throw std::runtime_error("Failed to open file");
  }

  fprintf(out, "%s\n", text.c_str());
  fclose(out);
}

17
decompiler/util/FileIO.h Normal file
View File

@ -0,0 +1,17 @@
#ifndef JAK_V2_FILEIO_H
#define JAK_V2_FILEIO_H
#include <string>
#include <vector>
// Read the contents of a file as a string.
std::string read_text_file(const std::string& path);
// Join parent and child with a '/' between them.
std::string combine_path(const std::string& parent, const std::string& child);
// Read the contents of a file as bytes. Throws std::runtime_error on failure.
std::vector<uint8_t> read_binary_file(const std::string& filename);
// Remove the directory part of a path. The input must not be empty.
std::string base_name(const std::string& filename);
// Write text plus a trailing newline to a file. Throws std::runtime_error if it can't be opened.
void write_text_file(const std::string& file_name, const std::string& text);
// Build the CRC table. Must be called before either crc32 overload (they assert on it).
void init_crc();
uint32_t crc32(const uint8_t* data, size_t size);
uint32_t crc32(const std::vector<uint8_t>& data);
#endif //JAK_V2_FILEIO_H

View File

@ -0,0 +1,514 @@
#include "LispPrint.h"
#include <cassert>
#include <iostream>
#include <vector>
//////// HACK - symbol table now looks up by string, which makes it really stupid and store
// all strings twice.
// should probably just remove it
/*!
 * String interning.
 * Returns the stored string for str, creating and remembering a new copy the first time a
 * given string is seen. Equal strings always give the same pointer.
 */
std::string* SymbolTable::intern(const std::string& str) {
  auto existing = map.find(str);
  if (existing != map.end()) {
    return existing->second;
  }

  auto* interned = new std::string(str);
  map[str] = interned;
  return interned;
}
/*!
* Global interned string table.
* It has external linkage: it is not declared static or in an anonymous namespace.
*/
SymbolTable gSymbolTable;
/*!
 * Create the table's empty-list form (a Form of kind EMPTY_LIST).
 */
SymbolTable::SymbolTable() {
  auto empty = std::make_shared<Form>();
  empty->kind = FormKind::EMPTY_LIST;
  empty_pair = empty;
}
/*!
 * Free every string stored in the table.
 */
SymbolTable::~SymbolTable() {
  for (const auto& entry : map) {
    delete entry.second;
  }
}
/*!
 * Convert a form to a one-line string (no line breaks are inserted).
 */
std::string Form::toStringSimple() {
  std::string text;
  buildStringSimple(text);
  return text;
}
/*!
 * Append the one-line text of this form to str.
 * The form is first turned into tokens; each token is then appended in order.
 * Throws std::runtime_error for a token kind that is not handled here.
 */
void Form::buildStringSimple(std::string& str) {
  std::vector<FormToken> tokens;
  toTokenList(tokens);

  for (const auto& token : tokens) {
    switch (token.kind) {
      case TokenKind::SYMBOL:
      case TokenKind::SPECIAL_SYMBOL:
        // both kinds carry their text in the token
        str += *token.str;
        break;
      case TokenKind::WHITESPACE:
        str += ' ';
        break;
      case TokenKind::OPEN_PAREN:
        str += '(';
        break;
      case TokenKind::CLOSE_PAREN:
        str += ')';
        break;
      case TokenKind::DOT:
        str += '.';
        break;
      case TokenKind::EMPTY_PAIR:
        str += "()";
        break;
      default:
        throw std::runtime_error("buildStringSimple unknown token kind");
    }
  }
}
/*!
 * Append the tokens that print this form to the end of tokens.
 * A symbol gives one SYMBOL token and the empty list gives one EMPTY_PAIR token.
 * A pair is printed as a list: open paren, the elements separated by whitespace, close paren.
 * If the chain of pairs does not end in the empty list, the final element is printed
 * after a DOT token. Throws std::runtime_error for any other form kind.
 */
void Form::toTokenList(std::vector<FormToken> &tokens) {
switch(kind) {
case FormKind::SYMBOL:
tokens.emplace_back(TokenKind::SYMBOL, symbol);
break;
case FormKind::PAIR:
{
tokens.emplace_back(TokenKind::OPEN_PAREN);
// walk down the chain of pairs, printing each first element
Form* toPrint = this;
for(;;) {
if(toPrint->kind == FormKind::PAIR) {
toPrint->pair[0]->toTokenList(tokens); // print CAR
toPrint = toPrint->pair[1].get();
if(toPrint->kind == FormKind::EMPTY_LIST) {
// reached the end of a proper list
tokens.emplace_back(TokenKind::CLOSE_PAREN);
return;
} else {
tokens.emplace_back(TokenKind::WHITESPACE);
}
} else { // not a proper list!
tokens.emplace_back(TokenKind::DOT);
tokens.emplace_back(TokenKind::WHITESPACE);
toPrint->toTokenList(tokens);
tokens.emplace_back(TokenKind::CLOSE_PAREN);
return;
}
}
}
break;
case FormKind::EMPTY_LIST:
tokens.emplace_back(TokenKind::EMPTY_PAIR);
break;
default:
throw std::runtime_error("unhandled form type in buildSimpleString");
break;
}
}
///////////////////
// Pretty Printer
///////////////////
/*!
 * One node of the doubly linked list the pretty printer works on. A node is either a token
 * of the output (symbol, paren, whitespace, ...) or a line separator.
 */
struct PrettyPrinterNode {
  FormToken* tok = nullptr;        // the token; stays null for line separators
  int line = -1;                   // line the token is on; not meaningful for line separators
  int lineIndent = -1;             // indent of the line; only valid on the first token of a line
  int offset = -1;                 // distance of the start of the token from the left margin
  int specialIndentDelta = 0;
  bool is_line_separator = false;  // true for a line separator (which is not a token)

  // linked list
  PrettyPrinterNode* next = nullptr;
  PrettyPrinterNode* prev = nullptr;

  // for tokens inside parens: the open paren. An open paren points to its close paren and
  // vice versa.
  PrettyPrinterNode* paren = nullptr;

  PrettyPrinterNode() = default;
  explicit PrettyPrinterNode(FormToken& _tok) : tok(&_tok) {}
};
/*!
 * Splice a line separator into the list right after node.
 * Nothing happens if node is the last node or is already followed by a line separator.
 */
static void insertNewlineAfter(PrettyPrinterNode* node, int specialIndentDelta) {
  PrettyPrinterNode* following = node->next;
  if (!following || following->is_line_separator) {
    return;
  }

  auto* separator = new PrettyPrinterNode;
  separator->is_line_separator = true;
  separator->specialIndentDelta = specialIndentDelta;

  // link: node <-> separator <-> following
  separator->prev = node;
  separator->next = following;
  node->next = separator;
  following->prev = separator;
}
/*!
 * Splice a line separator into the list right before node.
 * Nothing happens if node is the first node or is already preceded by a line separator.
 */
static void insertNewlineBefore(PrettyPrinterNode* node, int specialIndentDelta) {
  PrettyPrinterNode* preceding = node->prev;
  if (!preceding || preceding->is_line_separator) {
    return;
  }

  auto* separator = new PrettyPrinterNode;
  separator->is_line_separator = true;
  separator->specialIndentDelta = specialIndentDelta;

  // link: preceding <-> separator <-> node
  separator->prev = preceding;
  separator->next = node;
  preceding->next = separator;
  node->prev = separator;
}
/*!
* Break a list across multiple lines. This is the fundamental reducing operation of this algorithm
* leftParen must be the open paren token of the list. A line break is inserted after each
* direct element of the list: after the close paren of a nested list (its contents are
* skipped, not broken) and after every other token that is not whitespace.
*/
static void breakList(PrettyPrinterNode* leftParen) {
assert(!leftParen->is_line_separator);
assert(leftParen->tok->kind == TokenKind::OPEN_PAREN);
auto* rp = leftParen->paren;
assert(rp->tok->kind == TokenKind::CLOSE_PAREN);
for(auto* n = leftParen->next; n && n != rp; n = n->next) {
if(!n->is_line_separator) {
if(n->tok->kind == TokenKind::OPEN_PAREN) {
// jump to the matching close paren so the nested list stays intact
n = n->paren;
assert(n->tok->kind == TokenKind::CLOSE_PAREN);
insertNewlineAfter(n, 0);
} else if(n->tok->kind != TokenKind::WHITESPACE) {
assert(n->tok->kind != TokenKind::CLOSE_PAREN);
insertNewlineAfter(n, 0);
}
}
}
}
/*!
* Compute proper line numbers, offsets, and indents for a list of tokens with newlines
* Will add newlines for close parens if needed.
* Returns the first token of the first line whose length exceeds line_length, or nullptr
* if no line is too long.
*/
static PrettyPrinterNode* propagatePretty(PrettyPrinterNode* list, int line_length) {
// propagate line numbers
PrettyPrinterNode* rv = nullptr;
int line = list->line;
for(auto* n = list; n; n = n->next) {
if(n->is_line_separator) {
line++;
} else {
n->line = line;
// add the weird newline.
// (a close paren that is not on the same line as its open paren gets line breaks
// around it)
if(n->tok->kind == TokenKind::CLOSE_PAREN) {
if(n->line != n->paren->line) {
if(n->prev && !n->prev->is_line_separator) {
insertNewlineBefore(n, 0);
line++;
}
if(n->next && !n->next->is_line_separator) {
insertNewlineAfter(n, 0);
}
}
}
}
}
// compute offsets and indents
// indentStack holds one indent per currently open paren, on top of a base indent of 0
std::vector<int> indentStack;
indentStack.push_back(0);
int offset = 0;
PrettyPrinterNode* line_start = list;
bool previous_line_sep = false;
for(auto* n = list; n; n = n->next) {
if(n->is_line_separator) {
previous_line_sep = true;
// the separator's delta is added to the indent on top of the stack (and stays there);
// the new line starts at that indent
offset = indentStack.back() += n->specialIndentDelta;
} else {
if(previous_line_sep) {
line_start = n;
n->lineIndent = offset;
previous_line_sep = false;
}
n->offset = offset;
offset += n->tok->toString().length();
// remember only the first line that is too long
if(offset > line_length && !rv) rv = line_start;
if(n->tok->kind == TokenKind::OPEN_PAREN) {
if(!n->prev || n->prev->is_line_separator) {
indentStack.push_back(offset + 1);
} else {
indentStack.push_back(offset - 1);
}
}
if(n->tok->kind == TokenKind::CLOSE_PAREN) {
indentStack.pop_back();
}
}
}
return rv;
}
/*!
 * Get the first token of the line after the one start is on.
 * Returns nullptr if start is on the last line. start must be a token, not a line separator.
 */
static PrettyPrinterNode* getNextLine(PrettyPrinterNode* start) {
  assert(!start->is_line_separator);
  const int line = start->line;

  // skip everything that is still on our line, as well as line separators
  while (start->is_line_separator || start->line == line) {
    start = start->next;
    if (!start) {
      return nullptr;
    }
  }
  return start;
}
/*!
 * Get the next open paren on the current line, searching after start (start itself is not
 * considered, and may be in the middle of the line).
 * Returns nullptr if there is no open paren on the rest of the line.
 */
static PrettyPrinterNode* getNextListOnLine(PrettyPrinterNode* start) {
  const int line = start->line;
  assert(!start->is_line_separator);

  for (PrettyPrinterNode* cur = start->next;
       cur && !cur->is_line_separator && cur->line == line; cur = cur->next) {
    if (cur->tok->kind == TokenKind::OPEN_PAREN) {
      return cur;
    }
  }
  return nullptr;
}
/*!
 * Get the first open paren on the current line, searching from start (start itself is
 * considered, and may be in the middle of the line).
 * Returns nullptr if there is no open paren on the rest of the line.
 */
static PrettyPrinterNode* getFirstListOnLine(PrettyPrinterNode* start) {
  const int line = start->line;
  assert(!start->is_line_separator);

  for (PrettyPrinterNode* cur = start; cur && !cur->is_line_separator && cur->line == line;
       cur = cur->next) {
    if (cur->tok->kind == TokenKind::OPEN_PAREN) {
      return cur;
    }
  }
  return nullptr;
}
/*!
 * Starting at start, find the first line containing a token whose offset is past
 * line_length. Returns the first token seen on that line (start itself if the line is the
 * one start is on), or nullptr if no such line exists.
 */
static PrettyPrinterNode* getFirstBadLine(PrettyPrinterNode* start, int line_length) {
  assert(!start->is_line_separator);
  int lineNumber = start->line;
  PrettyPrinterNode* lineHead = start;

  for (PrettyPrinterNode* cur = start;; cur = cur->next) {
    if (cur->is_line_separator) {
      assert(cur->next);
      continue;
    }

    if (cur->line != lineNumber) {
      // first token we see on a new line
      lineNumber = cur->line;
      lineHead = cur;
    }
    if (cur->offset > line_length) {
      return lineHead;
    }
    if (!cur->next) {
      return nullptr;
    }
  }
}
/*!
* Break insertion algorithm.
* Repeatedly finds a line that is too long and breaks lists on it until the line is no
* longer reported as too long, or there are no more lists on the line to break.
* Stops when no too-long line can be found. If a too-long line has no list on it at all,
* a message is printed and the assert fires.
*/
static void insertBreaksAsNeeded(PrettyPrinterNode* head, int line_length) {
PrettyPrinterNode* last_line_complete = nullptr;
PrettyPrinterNode* line_to_start_line_search = head;
// loop over lines
for(;;) {
// compute lines as needed
propagatePretty(head, line_length);
// search for a bad line starting at the last line we fixed
PrettyPrinterNode* candidate_line = getFirstBadLine(line_to_start_line_search, line_length);
// if we got the same line we started on, this means we couldn't fix it.
if(candidate_line == last_line_complete) {
candidate_line = nullptr; // so we say our candidate was bad and try to find another
PrettyPrinterNode* next_line = getNextLine(line_to_start_line_search);
if(next_line) {
candidate_line = getFirstBadLine(next_line, line_length);
}
}
if(!candidate_line) break;
// okay, we have a line which needs fixing.
assert(!candidate_line->prev || candidate_line->prev->is_line_separator);
PrettyPrinterNode* form_to_start = getFirstListOnLine(candidate_line);
// break one list at a time, left to right, until this line is no longer the first bad one
for(;;) {
if(!form_to_start) {
printf("pretty printer has failed. Fix the bug or increase the the line length.\n");
assert(false);
}
breakList(form_to_start);
propagatePretty(head, line_length);
if(getFirstBadLine(candidate_line, line_length) != candidate_line) {
break;
}
form_to_start = getNextListOnLine(form_to_start);
if(!form_to_start) break;
}
// remember this line so the next search can tell if it comes back unchanged
last_line_complete = candidate_line;
line_to_start_line_search = candidate_line;
}
}
/*!
 * Insert the fixed line breaks that do not depend on line length.
 * For every "deftype" symbol, the next open paren on the same line is located and a
 * newline is inserted after the node stored in that paren's `paren` field.
 */
static void insertSpecialBreaks(PrettyPrinterNode* node) {
  for (PrettyPrinterNode* cursor = node; cursor; cursor = cursor->next) {
    // Separators have no token to inspect.
    if (cursor->is_line_separator) {
      continue;
    }
    if (cursor->tok->kind != TokenKind::SYMBOL) {
      continue;
    }
    if (*cursor->tok->str != "deftype") {
      continue;
    }

    // The first list following "deftype" on this line, if there is one.
    PrettyPrinterNode* first_list = getNextListOnLine(cursor);
    if (first_list) {
      insertNewlineAfter(first_list->paren, 0);
    }
  }
}
std::string Form::toStringPretty(int indent, int line_length) {
(void)indent;
(void)line_length;
std::vector<FormToken> tokens;
toTokenList(tokens);
assert(!tokens.empty());
std::string pretty;
// build linked list of nodes
PrettyPrinterNode* head = new PrettyPrinterNode(tokens[0]);
PrettyPrinterNode* node = head;
head->line = 0;
head->offset = 0;
head->lineIndent = 0;
int offset = head->tok->toString().length();
for(size_t i = 1; i < tokens.size(); i++) {
node->next = new PrettyPrinterNode(tokens[i]);
node->next->prev = node;
node = node->next;
node->line = 0;
node->offset = offset;
offset += node->tok->toString().length();
node->lineIndent = 0;
}
// attach parens.
std::vector<PrettyPrinterNode*> parenStack;
parenStack.push_back(nullptr);
for(PrettyPrinterNode* n = head; n; n = n->next) {
if(n->tok->kind == TokenKind::OPEN_PAREN) {
parenStack.push_back(n);
} else if(n->tok->kind == TokenKind::CLOSE_PAREN) {
n->paren = parenStack.back();
parenStack.back()->paren = n;
parenStack.pop_back();
} else {
n->paren = parenStack.back();
}
}
assert(parenStack.size() == 1);
assert(!parenStack.back());
insertSpecialBreaks(head);
propagatePretty(head, line_length);
insertBreaksAsNeeded(head, line_length);
// write to string
bool newline_prev = true;
for(PrettyPrinterNode* n = head; n; n = n->next) {
if(n->is_line_separator){
pretty.push_back('\n');
newline_prev = true;
} else {
if(newline_prev) {
pretty.append(n->lineIndent, ' ');
newline_prev = false;
if(n->tok->kind == TokenKind::WHITESPACE) continue;
}
pretty.append(n->tok->toString());
}
}
for(;;) {
if(!head) break;
auto* next = head->next;
delete head;
head = next;
}
return pretty;
}
std::shared_ptr<Form> toForm(const std::string& str) {
auto f = std::make_shared<Form>();
f->kind = FormKind::SYMBOL;
f->symbol = gSymbolTable.intern(str);
return f;
}
std::shared_ptr<Form> buildList(std::shared_ptr<Form> form) {
auto f = std::make_shared<Form>();
f->kind = FormKind::PAIR;
f->pair[0] = form;
f->pair[1] = gSymbolTable.getEmptyPair();
return f;
}
std::shared_ptr<Form> buildList(const std::string& str) {
return buildList(toForm(str));
}
std::shared_ptr<Form> buildList(std::shared_ptr<Form>* forms, int count) {
auto f = std::make_shared<Form>();
f->kind = FormKind::PAIR;
f->pair[0] = forms[0];
if(count - 1) {
f->pair[1] = buildList(forms + 1, count - 1);
} else {
f->pair[1] = gSymbolTable.getEmptyPair();
}
return f;
}
/*!
 * Build a list from every element of forms, in order.
 * An empty vector yields the shared empty pair.
 */
std::shared_ptr<Form> buildList(std::vector<std::shared_ptr<Form>>& forms) {
  const bool has_elements = !forms.empty();
  return has_elements ? buildList(forms.data(), static_cast<int>(forms.size()))
                      : gSymbolTable.getEmptyPair();
}

142
decompiler/util/LispPrint.h Normal file
View File

@ -0,0 +1,142 @@
#ifndef JAK2_DISASSEMBLER_LISPPRINT_H
#define JAK2_DISASSEMBLER_LISPPRINT_H
#include <memory>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>
/*!
* What type of thing is it?
* Stored in Form::kind. The buildList helpers in this header create PAIR forms and
* toForm creates SYMBOL forms.
* NOTE(review): how the *_NUMBER, STRING and EMPTY_LIST kinds store their value is not
* visible from this header - presumably the number kinds differ only in how the value is
* printed; confirm in LispPrint.cpp.
* The enumerators have no explicit values, so their order determines their numeric value.
*/
enum class FormKind {
SYMBOL,
HEX_NUMBER,
DECIMAL_NUMBER,
BINARY_NUMBER,
SIGNED_NUMBER,
STRING,
EMPTY_LIST,
PAIR
};
/*!
 * Tokens in a textual representation
 */
enum class TokenKind {
  WHITESPACE,     // printed as a single space
  SYMBOL,         // printed as the text in FormToken::str
  OPEN_PAREN,     // printed as "("
  DOT,            // printed as "."
  CLOSE_PAREN,    // printed as ")"
  EMPTY_PAIR,     // printed as "()"
  SPECIAL_SYMBOL  // printed as the text in FormToken::str
};

/*!
 * Token in a text representation
 * kind selects what is printed; str supplies the text for SYMBOL and SPECIAL_SYMBOL
 * tokens and is not read for the other kinds. The token stores the pointer as given and
 * never deletes it.
 */
struct FormToken {
  explicit FormToken(TokenKind _kind, std::string* _str = nullptr) : kind(_kind), str(_str) {}

  TokenKind kind;
  std::string* str;

  /*!
   * Text of this token.
   * Throws std::runtime_error if kind is not a known TokenKind, or if a SYMBOL or
   * SPECIAL_SYMBOL token has no text (str is null).
   */
  std::string toString() const {
    switch (kind) {
      case TokenKind::WHITESPACE:
        return " ";
      case TokenKind::SYMBOL:
      case TokenKind::SPECIAL_SYMBOL:
        // str defaults to nullptr in the constructor, so check before dereferencing.
        if (!str) {
          throw std::runtime_error("toString symbol token has no text");
        }
        return *str;
      case TokenKind::OPEN_PAREN:
        return "(";
      case TokenKind::DOT:
        return ".";
      case TokenKind::CLOSE_PAREN:
        return ")";
      case TokenKind::EMPTY_PAIR:
        return "()";
    }
    // Reached only when kind holds a value outside the enumerators above.
    throw std::runtime_error("toString unknown token kind");
  }
};
/*!
 * S-Expression Form
 * kind says which of the fields below is meaningful:
 *  - symbol is set by toForm() for SYMBOL forms (an interned string, not owned by the Form)
 *  - pair[0] / pair[1] hold the element and the rest of the list for PAIR forms
 * A default-constructed Form is a SYMBOL with a null symbol and two null pair entries.
 */
class Form {
 public:
  // Initialized so that a Form never carries indeterminate values, however it is created.
  FormKind kind = FormKind::SYMBOL;
  std::string* symbol = nullptr;
  std::shared_ptr<Form> pair[2];

  std::string toStringSimple();

  /*!
   * Text of this form with line breaks inserted to respect line_length.
   * indent is currently ignored by the implementation.
   */
  std::string toStringPretty(int indent = 0, int line_length = 80);

  /*!
   * Append the tokens of this form's textual representation to tokens.
   */
  void toTokenList(std::vector<FormToken>& tokens);

 private:
  void buildStringSimple(std::string& str);
};
/*!
 * Symbol table to reduce the number of strings everywhere.
 * intern() maps a name to a std::string pointer; getEmptyPair() hands out the single
 * shared Form used as the empty list / list terminator.
 * NOTE(review): the class declares a destructor and stores raw std::string pointers, so
 * it presumably owns the interned strings and should not be copied - confirm against
 * the definitions in LispPrint.cpp.
 */
class SymbolTable {
 public:
  SymbolTable();
  ~SymbolTable();

  // Look up (or add) str and return its interned string.
  std::string* intern(const std::string& str);

  // The shared empty-pair form; every caller receives the same object.
  std::shared_ptr<Form> getEmptyPair() {
    return empty_pair;
  }

 private:
  std::unordered_map<std::string, std::string*> map;
  std::shared_ptr<Form> empty_pair;
};
/*!
* Global symbol table used for the compiler/decompiler
*/
extern SymbolTable gSymbolTable;
std::shared_ptr<Form> toForm(const std::string& str); // SYMBOL form named str, interned in gSymbolTable
// One-element list holding the symbol named str.
std::shared_ptr<Form> buildList(const std::string& str);
// One-element list holding form, terminated by the empty pair.
std::shared_ptr<Form> buildList(std::shared_ptr<Form> form);
// List of every element of forms, in order; an empty vector gives the empty pair.
std::shared_ptr<Form> buildList(std::vector<std::shared_ptr<Form>>& forms);
// List of the first count entries of forms, in order; count is expected to be at least 1.
std::shared_ptr<Form> buildList(std::shared_ptr<Form>* forms, int count);
template <typename... Args>
std::shared_ptr<Form> buildList(const std::string& str, Args... rest) {
auto f = std::make_shared<Form>();
f->kind = FormKind::PAIR;
f->pair[0] = toForm(str);
f->pair[1] = buildList(rest...);
return f;
}
template <typename... Args>
std::shared_ptr<Form> buildList(std::shared_ptr<Form> car, Args... rest) {
auto f = std::make_shared<Form>();
f->kind = FormKind::PAIR;
f->pair[0] = car;
f->pair[1] = buildList(rest...);
return f;
}
#endif // JAK2_DISASSEMBLER_LISPPRINT_H

54
decompiler/util/Timer.cpp Normal file
View File

@ -0,0 +1,54 @@
#include "Timer.h"
#ifdef _WIN32
#include <Windows.h>
#define MS_PER_SEC 1000ULL // MS = milliseconds
#define US_PER_MS 1000ULL // US = microseconds
#define HNS_PER_US 10ULL // HNS = hundred-nanoseconds (e.g., 1 hns = 100 ns)
#define NS_PER_US 1000ULL
#define HNS_PER_SEC (MS_PER_SEC * US_PER_MS * HNS_PER_US)
#define NS_PER_HNS (100ULL) // NS = nanoseconds
#define NS_PER_SEC (MS_PER_SEC * US_PER_MS * NS_PER_US)
int Timer::clock_gettime_monotonic(struct timespec* tv) {
static LARGE_INTEGER ticksPerSec;
LARGE_INTEGER ticks;
double seconds;
if (!ticksPerSec.QuadPart) {
QueryPerformanceFrequency(&ticksPerSec);
if (!ticksPerSec.QuadPart) {
errno = ENOTSUP;
return -1;
}
}
QueryPerformanceCounter(&ticks);
seconds = (double)ticks.QuadPart / (double)ticksPerSec.QuadPart;
tv->tv_sec = (time_t)seconds;
tv->tv_nsec = (long)((ULONGLONG)(seconds * NS_PER_SEC) % NS_PER_SEC);
return 0;
}
#endif
/*!
 * Record the current monotonic time as the start of the measured interval.
 */
void Timer::start() {
  // Windows has no clock_gettime, so it goes through the member shim; every other
  // platform is expected to provide POSIX clock_gettime with CLOCK_MONOTONIC.
#ifdef _WIN32
  clock_gettime_monotonic(&_startTime);
#else
  clock_gettime(CLOCK_MONOTONIC, &_startTime);
#endif
}

/*!
 * Nanoseconds elapsed since the most recent start().
 */
int64_t Timer::getNs() {
  struct timespec now = {};
#ifdef _WIN32
  clock_gettime_monotonic(&now);
#else
  clock_gettime(CLOCK_MONOTONIC, &now);
#endif

  // Widen to 64 bits before scaling: where time_t is 32 bits wide, multiplying the
  // seconds difference by 1e9 in time_t would overflow after roughly two seconds.
  const int64_t sec_delta =
      static_cast<int64_t>(now.tv_sec) - static_cast<int64_t>(_startTime.tv_sec);
  const int64_t nsec_delta =
      static_cast<int64_t>(now.tv_nsec) - static_cast<int64_t>(_startTime.tv_nsec);
  return nsec_delta + int64_t{1000000000} * sec_delta;
}

47
decompiler/util/Timer.h Normal file
View File

@ -0,0 +1,47 @@
#ifndef JAK_V2_TIMER_H
#define JAK_V2_TIMER_H
#include <cassert>
#include <cstdint>
#include <ctime>
/*!
 * Stopwatch built on a monotonic clock.
 * The timer starts when it is constructed; start() restarts it. The get* functions
 * report the time elapsed since the most recent start and do not stop the timer.
 */
class Timer {
 public:
  /*!
   * Construct and start timer
   */
  explicit Timer() { start(); }

#ifdef _WIN32
  // Windows replacement for clock_gettime(CLOCK_MONOTONIC, ...).
  int clock_gettime_monotonic(struct timespec* tv);
#endif

  /*!
   * Start (or restart) the timer
   */
  void start();

  /*!
   * Get nanoseconds elapsed
   */
  int64_t getNs();

  /*!
   * Get microseconds elapsed
   */
  double getUs() { return static_cast<double>(getNs()) / 1.e3; }

  /*!
   * Get milliseconds elapsed
   */
  double getMs() { return static_cast<double>(getNs()) / 1.e6; }

  /*!
   * Get seconds elapsed
   */
  double getSeconds() { return static_cast<double>(getNs()) / 1.e9; }

  // Time captured by the most recent start().
  struct timespec _startTime = {};
};
#endif // JAK_V2_TIMER_H

2
decompiler_out/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
*
!.gitignore