diff --git a/main.cpp b/main.cpp index 9a25cbb..8a4ed20 100644 --- a/main.cpp +++ b/main.cpp @@ -30,6 +30,7 @@ static vector read_binary(const char *path, bool flip) return v; } +#if 0 static bool read_tag_validate(FILE *file, const char *tag) { char tmp[9] = {}; @@ -289,10 +290,11 @@ static void validate_trace(RSP::CPU &cpu, const char *path) fclose(file); } +#endif int main(int argc, char *argv[]) { - RSP::CPU cpu; + RSP::JIT::CPU cpu; auto &state = cpu.get_state(); uint32_t cr[16] = {}; @@ -318,8 +320,10 @@ int main(int argc, char *argv[]) cpu.run(); } } +#if 0 else if (argc == 2) validate_trace(cpu, argv[1]); +#endif else return 1; } diff --git a/rsp_jit.cpp b/rsp_jit.cpp index 1728de6..c382426 100644 --- a/rsp_jit.cpp +++ b/rsp_jit.cpp @@ -1,12 +1,23 @@ #include "rsp_jit.hpp" #include +#include using namespace std; +// We're only guaranteed 3 V registers (x86). #define JIT_REGISTER_SELF JIT_V0 #define JIT_REGISTER_STATE JIT_V1 +#define JIT_REGISTER_DMEM JIT_V2 + #define JIT_REGISTER_MODE JIT_R1 #define JIT_REGISTER_NEXT_PC JIT_R0 + +// Freely used to implement instructions. +#define JIT_REGISTER_TMP0 JIT_R0 +#define JIT_REGISTER_TMP1 JIT_R1 + +// We're only guaranteed 3 R registers (x86). +#define JIT_REGISTER_COND_BRANCH_TAKEN JIT_R(JIT_R_NUM - 1) #define JIT_FRAME_SIZE 256 namespace RSP @@ -72,6 +83,9 @@ uint64_t CPU::hash_imem(unsigned pc, unsigned count) const unsigned CPU::analyze_static_end(unsigned pc, unsigned end) { // Scans through IMEM and finds the logical "end" of the instruction stream. + // A logical end of the instruction stream is where execution must terminate. + // If we have forward branches into this block, i.e. gotos, they extend the execution stream. + // However, we cannot execute beyond end. unsigned max_static_pc = pc; unsigned count = end - pc; @@ -100,7 +114,8 @@ unsigned CPU::analyze_static_end(unsigned pc, unsigned end) switch (instr & 63) { case 010: - // JR always terminates either by returning or exiting. 
+ case 011: + // JR and JALR always terminate execution of the block. // We execute the next instruction via delay slot and exit. // Unless we can branch past the JR // (max_static_pc will be higher than expected), @@ -130,6 +145,7 @@ unsigned CPU::analyze_static_end(unsigned pc, unsigned end) case 001: // BGEZ case 021: // BGEZAL case 020: // BLTZAL + // TODO/Optimization: Handle static branch case where $0 is used. target = (pc + i + 1 + instr) & 0x3ff; if (target >= pc && target < end) // goto max_static_pc = max(max_static_pc, target + 1); @@ -140,8 +156,9 @@ unsigned CPU::analyze_static_end(unsigned pc, unsigned end) } break; - case 002: - // J is resolved by goto. + case 002: // J + case 003: // JAL + // J is resolved by goto. Same with JAL if call target happens to be inside the block. target = instr & 0x3ff; if (target >= pc && target < end) // goto { @@ -169,6 +186,7 @@ unsigned CPU::analyze_static_end(unsigned pc, unsigned end) case 005: // BNE case 006: // BLEZ case 007: // BGTZ + // TODO/Optimization: Handle static branch case where $0 is used. target = (pc + i + 1 + instr) & 0x3ff; if (target >= pc && target < end) // goto max_static_pc = max(max_static_pc, target + 1); @@ -207,6 +225,7 @@ void CPU::init_jit_thunks() jit_getarg(JIT_REGISTER_SELF, self); jit_getarg(JIT_REGISTER_STATE, state); jit_ldxi_i(JIT_REGISTER_NEXT_PC, JIT_REGISTER_STATE, offsetof(CPUState, pc)); + jit_ldxi(JIT_REGISTER_DMEM, JIT_REGISTER_STATE, offsetof(CPUState, dmem)); jit_movi(JIT_REGISTER_MODE, MODE_ENTER); auto *entry_label = jit_indirect(); @@ -272,30 +291,641 @@ int CPU::enter(uint32_t pc) return thunks.enter_frame(this, &state); } -Func CPU::jit_region(uint64_t hash, unsigned pc, unsigned count) +void CPU::jit_end_of_block(jit_state_t *_jit, uint32_t pc, const CPU::InstructionInfo &last_info) { + // If we run off the end of a block with a pending delay slot, we need to move it to CPUState. 
+ // We always branch to the next PC, and the delay slot will be handled after the first instruction in next block. + auto *forward = jit_forward(); + if (last_info.branch) + { + if (last_info.conditional) + jit_patch_at(jit_beqi(JIT_REGISTER_COND_BRANCH_TAKEN, 0), forward); + + if (last_info.indirect) + jit_ldxi_i(JIT_REGISTER_TMP0, JIT_REGISTER_STATE, offsetof(CPUState, sr) + 4 * last_info.branch_target); + else + jit_movi(JIT_REGISTER_TMP0, last_info.branch_target); + jit_stxi_i(offsetof(CPUState, branch_target), JIT_REGISTER_STATE, JIT_REGISTER_TMP0); + jit_movi(JIT_REGISTER_TMP0, 1); + jit_stxi_i(offsetof(CPUState, has_delay_slot), JIT_REGISTER_STATE, JIT_REGISTER_TMP0); + } + + jit_link(forward); + jit_movi(JIT_REGISTER_NEXT_PC, pc); + jit_patch_abs(jit_jmpi(), thunks.enter_thunk); +} + +void CPU::jit_handle_delay_slot(jit_state_t *_jit, const InstructionInfo &last_info, + jit_node_t **local_targets, uint32_t base_pc, uint32_t end_pc) +{ + if (last_info.conditional) + { + if (!last_info.indirect && last_info.branch_target >= base_pc && last_info.branch_target < end_pc) + { + jit_patch_at(jit_bnei(JIT_REGISTER_COND_BRANCH_TAKEN, 0), local_targets[(last_info.branch_target - base_pc) >> 2]); + } + else + { + auto *no_branch = jit_beqi(JIT_REGISTER_COND_BRANCH_TAKEN, 0); // Skip the exit jump when the branch was NOT taken (flag == 0); bnei here would invert the branch. + if (last_info.indirect) + jit_ldxi_i(JIT_REGISTER_NEXT_PC, JIT_REGISTER_STATE, offsetof(CPUState, sr) + 4 * last_info.branch_target); + else + jit_movi(JIT_REGISTER_NEXT_PC, last_info.branch_target); + jit_patch_abs(jit_jmpi(), thunks.enter_thunk); + jit_patch(no_branch); + } + } + else + { + if (!last_info.indirect && last_info.branch_target >= base_pc && last_info.branch_target < end_pc) + { + jit_patch_at(jit_jmpi(), local_targets[(last_info.branch_target - base_pc) >> 2]); + } + else + { + if (last_info.indirect) + jit_ldxi_i(JIT_REGISTER_NEXT_PC, JIT_REGISTER_STATE,offsetof(CPUState, sr) + 4 * last_info.branch_target); + else + jit_movi(JIT_REGISTER_NEXT_PC, last_info.branch_target); +
jit_patch_abs(jit_jmpi(), thunks.enter_thunk); + } + } +} + +void CPU::jit_exit(jit_state_t *_jit, uint32_t pc, const InstructionInfo &last_info, ReturnMode mode, bool first_instruction) +{ + if (first_instruction) + { + // Need to consider that we need to move delay slot to PC. + jit_ldxi_i(JIT_REGISTER_TMP0, JIT_REGISTER_STATE, offsetof(CPUState, has_delay_slot)); + + auto *latent_delay_slot = jit_forward(); + jit_patch_at(jit_bnei(JIT_REGISTER_TMP0, 0), latent_delay_slot); + + // Common case. + // Immediately exit. + jit_movi(JIT_REGISTER_MODE, mode); + jit_movi(JIT_REGISTER_NEXT_PC, pc + 4); + auto *jmp = jit_jmpi(); + jit_patch_abs(jmp, thunks.return_thunk); + + // If we had a latent delay slot, we handle it here. + jit_link(latent_delay_slot); + // We cannot execute a branch inside a delay slot, so just assume we do not have to chain together these. + // We could technically handle it, but it gets messy (and it's illegal MIPS), so don't bother. + jit_movi(JIT_REGISTER_NEXT_PC, 0); + jit_stxi_i(offsetof(CPUState, has_delay_slot), JIT_REGISTER_STATE, JIT_REGISTER_NEXT_PC); + jit_movi(JIT_REGISTER_MODE, mode); + jit_ldxi_i(JIT_REGISTER_NEXT_PC, JIT_REGISTER_STATE, offsetof(CPUState, branch_target)); + } + else if (!last_info.branch) + { + // Immediately exit. + jit_movi(JIT_REGISTER_MODE, mode); + jit_movi(JIT_REGISTER_NEXT_PC, pc + 4); + } + else if (!last_info.indirect && !last_info.conditional) + { + // Redirect PC to whatever value we were supposed to branch to. + jit_movi(JIT_REGISTER_MODE, mode); + jit_movi(JIT_REGISTER_NEXT_PC, last_info.branch_target); + } + else if (!last_info.conditional) + { + // We have an indirect branch, load that register into PC. + jit_ldxi_i(JIT_REGISTER_NEXT_PC, JIT_REGISTER_STATE, offsetof(CPUState, sr) + 4 * last_info.branch_target); + jit_movi(JIT_REGISTER_MODE, mode); + } + else if (last_info.indirect) + { + // Indirect conditional branch. 
+ jit_movi(JIT_REGISTER_MODE, mode); // Every other exit path sets the mode before reaching return_thunk; this one left it undefined. + // Default to the fall-through PC; the taken path below overrides it with the register target. + jit_movi(JIT_REGISTER_NEXT_PC, pc + 4); + auto *not_taken = jit_beqi(JIT_REGISTER_COND_BRANCH_TAKEN, 0); + jit_ldxi_i(JIT_REGISTER_NEXT_PC, JIT_REGISTER_STATE, offsetof(CPUState, sr) + 4 * last_info.branch_target); + jit_patch(not_taken); + } + else + { + // Direct conditional branch. + // Same shape as the indirect case: set the mode, default to pc + 4, override when the branch was taken. + jit_movi(JIT_REGISTER_MODE, mode); + jit_movi(JIT_REGISTER_NEXT_PC, pc + 4); + auto *not_taken = jit_beqi(JIT_REGISTER_COND_BRANCH_TAKEN, 0); + jit_movi(JIT_REGISTER_NEXT_PC, last_info.branch_target); + jit_patch(not_taken); + } + + auto *jmp = jit_jmpi(); + jit_patch_abs(jmp, thunks.return_thunk); +} + +void CPU::jit_load_register(jit_state_t *_jit, unsigned jit_register, unsigned mips_register) +{ + if (mips_register == 0) + jit_movi(jit_register, 0); + else + jit_ldxi_i(jit_register, JIT_REGISTER_STATE, offsetof(CPUState, sr) + 4 * mips_register); +} + +void CPU::jit_store_register(jit_state_t *_jit, unsigned jit_register, unsigned mips_register) +{ + assert(mips_register != 0); + jit_stxi_i(offsetof(CPUState, sr) + 4 * mips_register, JIT_REGISTER_STATE, jit_register); +} + +#define DISASM(asmfmt, ...) do { \ + char buf[1024]; \ + sprintf(buf, "0x%03x " asmfmt, pc, __VA_ARGS__); \ + mips_disasm += buf; \ +} while(0) + +#define DISASM_NOP() do { \ + char buf[1024]; \ + sprintf(buf, "0x%03x nop\n", pc); \ + mips_disasm += buf; \ +} while(0) + +void CPU::jit_instruction(jit_state_t *_jit, uint32_t pc, uint32_t instr, + InstructionInfo &info, const InstructionInfo &last_info, + bool first_instruction) +{ + // VU + if ((instr >> 25) == 0x25) + { + return; + } + + // TODO: Meaningful register allocation. + // For now, always flush register state to memory after an instruction for simplicity. + // Should be red-hot in L1 cache, so probably won't be that bad. + // On x86, we unfortunately have an anemic register bank to work with.
+ + uint32_t type = instr >> 26; + +#define NOP_IF_RD_ZERO() if (rd == 0) { DISASM_NOP(); break; } +#define NOP_IF_RT_ZERO() if (rt == 0) { DISASM_NOP(); break; } + + switch (type) + { + case 000: + { + auto rd = (instr >> 11) & 31; + auto rt = (instr >> 16) & 31; + auto shift = (instr >> 6) & 31; + auto rs = (instr >> 21) & 31; + + switch (instr & 63) + { + case 000: // SLL + { + NOP_IF_RD_ZERO(); + jit_load_register(_jit, JIT_REGISTER_TMP0, rt); + jit_lshi(JIT_REGISTER_TMP0, JIT_REGISTER_TMP0, shift); + jit_store_register(_jit, JIT_REGISTER_TMP0, rd); + DISASM("sll r%u, r%u, %u\n", rd, rt, shift); + break; + } + + case 002: // SRL + { + NOP_IF_RD_ZERO(); + jit_load_register(_jit, JIT_REGISTER_TMP0, rt); + jit_rshi_u(JIT_REGISTER_TMP0, JIT_REGISTER_TMP0, shift); + jit_store_register(_jit, JIT_REGISTER_TMP0, rd); + DISASM("srl r%u, r%u, %u\n", rd, rt, shift); + break; + } + + case 003: // SRA + { + NOP_IF_RD_ZERO(); + jit_load_register(_jit, JIT_REGISTER_TMP0, rt); + jit_rshi(JIT_REGISTER_TMP0, JIT_REGISTER_TMP0, shift); + jit_store_register(_jit, JIT_REGISTER_TMP0, rd); + DISASM("sra r%u, r%u, %u\n", rd, rt, shift); + break; + } + + case 004: // SLLV + { + NOP_IF_RD_ZERO(); + jit_load_register(_jit, JIT_REGISTER_TMP0, rt); + jit_load_register(_jit, JIT_REGISTER_TMP1, rs); + jit_andi(JIT_REGISTER_TMP1, JIT_REGISTER_TMP1, 31); + jit_lshr(JIT_REGISTER_TMP0, JIT_REGISTER_TMP0, JIT_REGISTER_TMP1); + jit_store_register(_jit, JIT_REGISTER_TMP0, rd); + DISASM("sllv r%u, r%u, r%u\n", rd, rt, rs); + break; + } + + case 006: // SRLV + { + NOP_IF_RD_ZERO(); + jit_load_register(_jit, JIT_REGISTER_TMP0, rt); + jit_load_register(_jit, JIT_REGISTER_TMP1, rs); + jit_andi(JIT_REGISTER_TMP1, JIT_REGISTER_TMP1, 31); + jit_rshr_u(JIT_REGISTER_TMP0, JIT_REGISTER_TMP0, JIT_REGISTER_TMP1); + jit_store_register(_jit, JIT_REGISTER_TMP0, rd); + DISASM("srlv r%u, r%u, r%u\n", rd, rt, rs); + break; + } + + case 007: // SRAV + { + NOP_IF_RD_ZERO(); + jit_load_register(_jit, JIT_REGISTER_TMP0, 
rt); + jit_load_register(_jit, JIT_REGISTER_TMP1, rs); + jit_andi(JIT_REGISTER_TMP1, JIT_REGISTER_TMP1, 31); + jit_rshr(JIT_REGISTER_TMP0, JIT_REGISTER_TMP0, JIT_REGISTER_TMP1); + jit_store_register(_jit, JIT_REGISTER_TMP0, rd); + DISASM("srav r%u, r%u, r%u\n", rd, rt, rs); + break; + } + + case 010: // JR + DISASM("jr %u\n", 0); + break; + case 011: // JALR + DISASM("jalr %u\n", 0); + break; + + case 015: // BREAK + { + jit_exit(_jit, pc, last_info, MODE_BREAK, first_instruction); + info.handles_delay_slot = true; + DISASM("break %u\n", 0); + break; + } + + case 040: // ADD + case 041: // ADDU + { + NOP_IF_RD_ZERO(); + jit_load_register(_jit, JIT_REGISTER_TMP0, rt); + jit_load_register(_jit, JIT_REGISTER_TMP1, rs); + jit_addr(JIT_REGISTER_TMP0, JIT_REGISTER_TMP0, JIT_REGISTER_TMP1); + jit_store_register(_jit, JIT_REGISTER_TMP0, rd); + DISASM("addu r%u, r%u, r%u\n", rd, rt, rs); + break; + } + + case 042: // SUB + case 043: // SUBU + { + NOP_IF_RD_ZERO(); + jit_load_register(_jit, JIT_REGISTER_TMP0, rs); // Minuend first: MIPS defines rd = rs - rt, subtraction is not commutative. + jit_load_register(_jit, JIT_REGISTER_TMP1, rt); + jit_subr(JIT_REGISTER_TMP0, JIT_REGISTER_TMP0, JIT_REGISTER_TMP1); + jit_store_register(_jit, JIT_REGISTER_TMP0, rd); + DISASM("subu r%u, r%u, r%u\n", rd, rs, rt); + break; + } + + case 044: // AND + { + NOP_IF_RD_ZERO(); + jit_load_register(_jit, JIT_REGISTER_TMP0, rt); + jit_load_register(_jit, JIT_REGISTER_TMP1, rs); + jit_andr(JIT_REGISTER_TMP0, JIT_REGISTER_TMP0, JIT_REGISTER_TMP1); + jit_store_register(_jit, JIT_REGISTER_TMP0, rd); + DISASM("and r%u, r%u, r%u\n", rd, rt, rs); + break; + } + + case 045: // OR + { + NOP_IF_RD_ZERO(); + jit_load_register(_jit, JIT_REGISTER_TMP0, rt); + jit_load_register(_jit, JIT_REGISTER_TMP1, rs); + jit_orr(JIT_REGISTER_TMP0, JIT_REGISTER_TMP0, JIT_REGISTER_TMP1); + jit_store_register(_jit, JIT_REGISTER_TMP0, rd); + DISASM("or r%u, r%u, r%u\n", rd, rt, rs); + break; + } + + case 046: // XOR + { + NOP_IF_RD_ZERO(); + jit_load_register(_jit, JIT_REGISTER_TMP0, rt); +
jit_load_register(_jit, JIT_REGISTER_TMP1, rs); + jit_xorr(JIT_REGISTER_TMP0, JIT_REGISTER_TMP0, JIT_REGISTER_TMP1); + jit_store_register(_jit, JIT_REGISTER_TMP0, rd); + DISASM("xor r%u, r%u, r%u\n", rd, rt, rs); + break; + } + + case 047: // NOR + { + NOP_IF_RD_ZERO(); + jit_load_register(_jit, JIT_REGISTER_TMP0, rt); + jit_load_register(_jit, JIT_REGISTER_TMP1, rs); + jit_orr(JIT_REGISTER_TMP0, JIT_REGISTER_TMP0, JIT_REGISTER_TMP1); + jit_xori(JIT_REGISTER_TMP0, JIT_REGISTER_TMP0, jit_word_t(-1)); + jit_store_register(_jit, JIT_REGISTER_TMP0, rd); + DISASM("nor r%u, r%u, r%u\n", rd, rt, rs); + break; + } + + case 052: // SLT + { + NOP_IF_RD_ZERO(); + jit_load_register(_jit, JIT_REGISTER_TMP0, rs); + jit_load_register(_jit, JIT_REGISTER_TMP1, rt); + jit_ltr(JIT_REGISTER_TMP0, JIT_REGISTER_TMP0, JIT_REGISTER_TMP1); + jit_store_register(_jit, JIT_REGISTER_TMP0, rd); + DISASM("slt r%u, r%u, r%u\n", rd, rs, rt); // Print operands in the order they are compared (rs < rt). + break; + } + + case 053: // SLTU + { + NOP_IF_RD_ZERO(); + jit_load_register(_jit, JIT_REGISTER_TMP0, rs); + jit_load_register(_jit, JIT_REGISTER_TMP1, rt); + jit_ltr_u(JIT_REGISTER_TMP0, JIT_REGISTER_TMP0, JIT_REGISTER_TMP1); + jit_store_register(_jit, JIT_REGISTER_TMP0, rd); + DISASM("sltu r%u, r%u, r%u\n", rd, rs, rt); // Print operands in the order they are compared (rs < rt). + break; + } + + default: + break; + } + break; + } + + case 001: // REGIMM + { + //unsigned rs = (instr >> 21) & 31; + unsigned rt = (instr >> 16) & 31; + + switch (rt) + { + case 020: // BLTZAL + DISASM("bltzal %u\n", 0); + break; + + case 000: // BLTZ + DISASM("bltz %u\n", 0); + break; + + case 021: // BGEZAL + DISASM("bgezal %u\n", 0); + break; + + case 001: // BGEZ + DISASM("bgez %u\n", 0); + break; + } + break; + } + + case 003: // JAL + { + uint32_t target_pc = (instr & 0x3ffu) << 2; + jit_movi(JIT_REGISTER_TMP0, (pc + 8) & 0xffcu); // Wrap the link address to the same 12-bit PC range as (instr & 0x3ff) << 2. + jit_store_register(_jit, JIT_REGISTER_TMP0, 31); + info.branch = true; + info.branch_target = target_pc; + DISASM("jal 0x%03x\n", target_pc); + break; + } + + case 002: // J + { + uint32_t target_pc = (instr &
0x3ffu) << 2; + info.branch = true; + info.branch_target = target_pc; + DISASM("j 0x%03x\n", target_pc); + break; + } + + case 004: // BEQ + DISASM("beq %u\n", 0); + break; + + case 005: // BNE + DISASM("bne %u\n", 0); + break; + + case 006: // BLEZ + DISASM("blez %u\n", 0); + break; + + case 007: // BGTZ + DISASM("bgtz %u\n", 0); + break; + + case 010: // ADDI + case 011: + { + unsigned rt = (instr >> 16) & 31; + NOP_IF_RT_ZERO(); + int16_t simm = int16_t(instr); + unsigned rs = (instr >> 21) & 31; + + jit_load_register(_jit, JIT_REGISTER_TMP0, rs); + jit_addi(JIT_REGISTER_TMP0, JIT_REGISTER_TMP0, simm); + jit_store_register(_jit, JIT_REGISTER_TMP0, rt); + DISASM("addi r%u, r%u, %d\n", rt, rs, simm); + break; + } + + case 012: // SLTI + { + unsigned rt = (instr >> 16) & 31; + NOP_IF_RT_ZERO(); + int16_t simm = int16_t(instr); + unsigned rs = (instr >> 21) & 31; + + jit_load_register(_jit, JIT_REGISTER_TMP0, rs); + jit_lti(JIT_REGISTER_TMP0, JIT_REGISTER_TMP0, simm); + jit_store_register(_jit, JIT_REGISTER_TMP0, rt); + DISASM("slti r%u, r%u, %d\n", rt, rs, simm); + break; + } + + case 013: // SLTIU + { + unsigned rt = (instr >> 16) & 31; + NOP_IF_RT_ZERO(); + uint16_t imm = uint16_t(instr); + unsigned rs = (instr >> 21) & 31; + + jit_load_register(_jit, JIT_REGISTER_TMP0, rs); + jit_lti_u(JIT_REGISTER_TMP0, JIT_REGISTER_TMP0, imm); + jit_store_register(_jit, JIT_REGISTER_TMP0, rt); + DISASM("sltiu r%u, r%u, %u\n", rt, rs, imm); + break; + } + + case 014: // ANDI + { + unsigned rt = (instr >> 16) & 31; + NOP_IF_RT_ZERO(); + unsigned rs = (instr >> 21) & 31; + uint16_t imm = uint16_t(instr); + jit_load_register(_jit, JIT_REGISTER_TMP0, rs); + jit_andi(JIT_REGISTER_TMP0, JIT_REGISTER_TMP0, imm); + jit_store_register(_jit, JIT_REGISTER_TMP0, rt); + DISASM("andi r%u, r%u, %u\n", rt, rs, imm); + break; + } + + case 015: // ORI + { + unsigned rt = (instr >> 16) & 31; + NOP_IF_RT_ZERO(); + unsigned rs = (instr >> 21) & 31; + uint16_t imm = uint16_t(instr); + 
jit_load_register(_jit, JIT_REGISTER_TMP0, rs); + jit_ori(JIT_REGISTER_TMP0, JIT_REGISTER_TMP0, imm); + jit_store_register(_jit, JIT_REGISTER_TMP0, rt); + DISASM("ori r%u, r%u, %u\n", rt, rs, imm); + break; + } + + case 016: // XORI + { + unsigned rt = (instr >> 16) & 31; + // Writes to $0 are dropped, but still log a nop like the sibling immediate ops do. + NOP_IF_RT_ZERO(); + unsigned rs = (instr >> 21) & 31; + uint16_t imm = uint16_t(instr); + jit_load_register(_jit, JIT_REGISTER_TMP0, rs); + jit_xori(JIT_REGISTER_TMP0, JIT_REGISTER_TMP0, imm); + jit_store_register(_jit, JIT_REGISTER_TMP0, rt); + DISASM("xori r%u, r%u, %u\n", rt, rs, imm); + break; + } + + case 017: // LUI + { + unsigned rt = (instr >> 16) & 31; + NOP_IF_RT_ZERO(); + int16_t imm = int16_t(instr); + jit_movi(JIT_REGISTER_TMP0, int32_t(uint32_t(instr) << 16)); // Shift as unsigned: left-shifting a negative int16_t is undefined before C++20. + jit_store_register(_jit, JIT_REGISTER_TMP0, rt); + DISASM("lui r%u, %d\n", rt, imm); + break; + } + + case 020: // COP0 + DISASM("cop0 %u\n", 0); + break; + + case 022: // COP2 + DISASM("cop2 %u\n", 0); + break; + + case 040: // LB + DISASM("lb %u\n", 0); + break; + + case 041: // LH + DISASM("lh %u\n", 0); + break; + + case 043: // LW + DISASM("lw %u\n", 0); + break; + + case 044: // LBU + DISASM("lbu %u\n", 0); + break; + + case 045: // LHU + DISASM("lhu %u\n", 0); + break; + + case 050: // SB + DISASM("sb %u\n", 0); + break; + + case 051: // SH + DISASM("sh %u\n", 0); + break; + + case 053: // SW + DISASM("sw %u\n", 0); + break; + + case 062: // LWC2 + DISASM("lwc2 %u\n", 0); + break; + + case 072: // SWC2 + DISASM("swc2 %u\n", 0); + break; + + default: + break; + } +} + +Func CPU::jit_region(uint64_t hash, unsigned pc_word, unsigned instruction_count) +{ + mips_disasm.clear(); jit_state_t *_jit = jit_new_state(); jit_prolog(); jit_tramp(JIT_FRAME_SIZE); - jit_movi(JIT_R0, 10); - jit_stxi_i(offsetof(CPUState, sr) + 4, JIT_REGISTER_STATE, JIT_R0); - jit_movi(JIT_R0, 20); - jit_stxi_i(offsetof(CPUState, sr) + 8, JIT_REGISTER_STATE, JIT_R0); - jit_movi(JIT_R0, 30); - jit_stxi_i(offsetof(CPUState, sr) + 12, JIT_REGISTER_STATE, JIT_R0); -
jit_movi(JIT_R0, 40); - jit_stxi_i(offsetof(CPUState, sr) + 16, JIT_REGISTER_STATE, JIT_R0); - jit_movi(JIT_REGISTER_MODE, MODE_BREAK); - jit_movi(JIT_REGISTER_NEXT_PC, 4); - auto *jmp = jit_jmpi(); - jit_patch_abs(jmp, thunks.return_thunk); + // We can potentially branch to every instruction in the block, so declare forward references to them here. + jit_node_t *branch_targets[CODE_BLOCK_SIZE]; + for (unsigned i = 0; i < instruction_count; i++) + branch_targets[i] = jit_forward(); + + jit_node_t *latent_delay_slot = nullptr; + + InstructionInfo last_info = {}; + for (unsigned i = 0; i < instruction_count; i++) + { + jit_link(branch_targets[i]); + + uint32_t instr = state.imem[pc_word + i]; + InstructionInfo inst_info = {}; + jit_instruction(_jit, (pc_word + i) << 2, instr, inst_info, last_info, i == 0); + + if (i == 0 && !inst_info.handles_delay_slot) + { + // After the first instruction, we might need to resolve a latent delay slot. + latent_delay_slot = jit_forward(); + jit_ldxi_i(JIT_REGISTER_TMP0, JIT_REGISTER_STATE, offsetof(CPUState, has_delay_slot)); + jit_patch_at(jit_bnei(JIT_REGISTER_TMP0, 0), latent_delay_slot); + } + else if (i != 0 && !inst_info.handles_delay_slot && last_info.branch) + { + // Normal handling of the delay slot. + jit_handle_delay_slot(_jit, last_info, branch_targets, + pc_word << 2, + (pc_word + instruction_count) << 2); + } + last_info = inst_info; + } + + // Jump to another block. + jit_end_of_block(_jit, (pc_word + instruction_count) << 2, last_info); + + // If we had a latent delay slot, we handle it here. + if (latent_delay_slot) + { + jit_link(latent_delay_slot); + // We cannot execute a branch inside a delay slot, so just assume we do not have to chain together these. + // We could technically handle it, but it gets messy (and it's illegal MIPS), so don't bother. 
+ jit_movi(JIT_REGISTER_NEXT_PC, 0); + jit_stxi_i(offsetof(CPUState, has_delay_slot), JIT_REGISTER_STATE, JIT_REGISTER_NEXT_PC); + jit_ldxi_i(JIT_REGISTER_NEXT_PC, JIT_REGISTER_STATE, offsetof(CPUState, branch_target)); + jit_patch_abs(jit_jmpi(), thunks.enter_thunk); + } auto ret = reinterpret_cast(jit_emit()); printf(" === DISASM ===\n"); jit_disassemble(); + printf("%s\n", mips_disasm.c_str()); printf(" === DISASM END ===\n\n"); cleanup_jit_states.push_back(_jit); return ret; @@ -328,5 +958,52 @@ ReturnMode CPU::run() } } } + +static const char *reg_names[32] = { + "zero", "at", "v0", "v1", "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", + "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", "t8", "t9", "k0", "k1", "gp", "sp", "s8", "ra", +}; +#define NAME(reg) reg_names[reg] + +void CPU::print_registers() +{ + fprintf(stderr, "RSP state:\n"); + fprintf(stderr, " PC: 0x%03x\n", state.pc); + for (unsigned i = 1; i < 32; i++) + fprintf(stderr, " SR[%s] = 0x%08x\n", NAME(i), state.sr[i]); + fprintf(stderr, "\n"); + for (unsigned i = 0; i < 32; i++) + { + fprintf(stderr, " VR[%02u] = { 0x%04x, 0x%04x, 0x%04x, 0x%04x, 0x%04x, 0x%04x, 0x%04x, 0x%04x }\n", i, + state.cp2.regs[i].e[0], state.cp2.regs[i].e[1], state.cp2.regs[i].e[2], state.cp2.regs[i].e[3], + state.cp2.regs[i].e[4], state.cp2.regs[i].e[5], state.cp2.regs[i].e[6], state.cp2.regs[i].e[7]); + } + + fprintf(stderr, "\n"); + + for (unsigned i = 0; i < 3; i++) + { + static const char *strings[] = { "ACC_HI", "ACC_MD", "ACC_LO" }; + fprintf(stderr, " %s = { 0x%04x, 0x%04x, 0x%04x, 0x%04x, 0x%04x, 0x%04x, 0x%04x, 0x%04x }\n", strings[i], + state.cp2.acc.e[8 * i + 0], state.cp2.acc.e[8 * i + 1], state.cp2.acc.e[8 * i + 2], + state.cp2.acc.e[8 * i + 3], state.cp2.acc.e[8 * i + 4], state.cp2.acc.e[8 * i + 5], + state.cp2.acc.e[8 * i + 6], state.cp2.acc.e[8 * i + 7]); + } + + fprintf(stderr, "\n"); + + for (unsigned i = 0; i < 3; i++) + { + static const char *strings[] = { "VCO", "VCC", "VCE" }; + 
uint16_t flags = rsp_get_flags(state.cp2.flags[i].e); + fprintf(stderr, " %s = 0x%04x\n", strings[i], flags); + } + + fprintf(stderr, "\n"); + fprintf(stderr, " Div Out = 0x%04x\n", state.cp2.div_out); + fprintf(stderr, " Div In = 0x%04x\n", state.cp2.div_in); + fprintf(stderr, " DP flag = 0x%04x\n", state.cp2.dp_flag); +} + } // namespace JIT } // namespace RSP \ No newline at end of file diff --git a/rsp_jit.hpp b/rsp_jit.hpp index 5fdd3d9..7918580 100644 --- a/rsp_jit.hpp +++ b/rsp_jit.hpp @@ -73,7 +73,7 @@ private: std::unordered_map cached_blocks[IMEM_WORDS]; - Func jit_region(uint64_t hash, unsigned pc, unsigned count); + Func jit_region(uint64_t hash, unsigned pc_word, unsigned instruction_count); int enter(uint32_t pc); @@ -90,6 +90,23 @@ private: } thunks; unsigned analyze_static_end(unsigned pc, unsigned end); + + struct InstructionInfo + { + uint32_t branch_target; + bool indirect; + bool branch; + bool conditional; + bool handles_delay_slot; + }; + void jit_instruction(jit_state_t *_jit, uint32_t pc, uint32_t instr, InstructionInfo &info, const InstructionInfo &last_info, + bool first_instruction); + void jit_exit(jit_state_t *_jit, uint32_t pc, const InstructionInfo &last_info, ReturnMode mode, bool first_instruction); + void jit_end_of_block(jit_state_t *_jit, uint32_t pc, const InstructionInfo &last_info); + static void jit_load_register(jit_state_t *_jit, unsigned jit_register, unsigned mips_register); + static void jit_store_register(jit_state_t *_jit, unsigned jit_register, unsigned mips_register); + void jit_handle_delay_slot(jit_state_t *_jit, const InstructionInfo &last_info, jit_node_t **local_targets, uint32_t base_pc, uint32_t end_pc); + std::string mips_disasm; }; } // namespace JIT } // namespace RSP