Armjit: branch code cleanup #1

This commit is contained in:
Henrik Rydgard 2013-01-30 00:02:04 +01:00
parent f1ce5285ea
commit 739b76a55a
6 changed files with 145 additions and 111 deletions

View File

@@ -99,12 +99,20 @@ void Jit::GenerateFixedCode()
PUSH(9, R4, R5, R6, R7, R8, R9, R10, R11, _LR);
// Take care to 8-byte align stack for function calls.
// This actually misaligns the stack within the JIT itself but that doesn't really matter
// as the JIT does not use the stack at all.
// We are misaligned here because of an odd number of args for PUSH.
// It's not like x86 where you need to account for an extra 4 bytes
// consumed by CALL.
SUB(_SP, _SP, 4);
// Now we are correctly aligned and plan to stay that way.
// TODO: R12 should be usable for regalloc but will get thrashed by C code.
// Fixed registers, these are always kept when in Jit context.
// R13 cannot be used as it's the stack pointer.
// TODO: Consider statically allocating:
// * downcount
// * R2-R4
// Really starting to run low on registers already though...
ARMABI_MOVI2R(R11, (u32)Memory::base);
ARMABI_MOVI2R(R10, (u32)mips_);
ARMABI_MOVI2R(R9, (u32)GetBlockCache()->GetCodePointers());

View File

@@ -95,8 +95,12 @@ namespace MIPSComp
{
gpr.MapDirtyIn(rt, rs);
Operand2 op2;
if (TryMakeOperand2(simm, op2)) {
CMP(gpr.R(rs), op2);
bool negated;
if (TryMakeOperand2_AllowNegation(simm, op2, &negated)) {
if (!negated)
CMP(gpr.R(rs), op2);
else
CMN(gpr.R(rs), op2);
} else {
ARMABI_MOVI2R(R0, simm);
CMP(gpr.R(rs), R0);
@@ -108,13 +112,18 @@ namespace MIPSComp
SetCC(CC_AL);
}
break;
/*
case 11: // R(rt) = R(rs) < uimm; break; //sltiu
{
LogBlockNumber();
gpr.MapDirtyIn(rt, rs);
Operand2 op2;
if (TryMakeOperand2(uimm, op2)) {
CMP(gpr.R(rs), op2);
bool negated;
if (TryMakeOperand2_AllowNegation(uimm, op2, &negated)) {
if (!negated)
CMP(gpr.R(rs), op2);
else
CMN(gpr.R(rs), op2);
} else {
ARMABI_MOVI2R(R0, uimm);
CMP(gpr.R(rs), R0);
@@ -126,7 +135,7 @@ namespace MIPSComp
SetCC(CC_AL);
}
break;
*/
case 15: // R(rt) = uimm << 16; //lui
gpr.SetImm(rt, uimm << 16);
break;

View File

@@ -37,6 +37,10 @@
#define LOOPOPTIMIZATION 0
// We can disable nice delay slots.
#define CONDITIONAL_NICE_DELAYSLOT delaySlotIsNice = false;
// #define CONDITIONAL_NICE_DELAYSLOT ;
using namespace MIPSAnalyst;
namespace MIPSComp
@@ -54,14 +58,12 @@ void Jit::BranchRSRTComp(u32 op, ArmGen::CCFlags cc, bool likely)
u32 targetAddr = js.compilerPC + offset + 4;
u32 delaySlotOp = Memory::ReadUnchecked_U32(js.compilerPC+4);
//Compile the delay slot
bool delaySlotIsNice = GetOutReg(delaySlotOp) != rt && GetOutReg(delaySlotOp) != rs;
if (!delaySlotIsNice)
{
//ERROR_LOG(CPU, "Not nice delay slot in BranchRSRTComp :( %08x", js.compilerPC);
}
// The delay slot being nice doesn't really matter though...
bool delaySlotIsNice = IsDelaySlotNiceReg(op, delaySlotOp, rt, rs);
CONDITIONAL_NICE_DELAYSLOT;
if (!likely && delaySlotIsNice)
CompileDelaySlot(DELAYSLOT_NICE);
// The delay slot being nice doesn't really matter though...
if (rt == 0)
{
@@ -83,27 +85,18 @@ void Jit::BranchRSRTComp(u32 op, ArmGen::CCFlags cc, bool likely)
}
FlushAll();
js.inDelaySlot = true;
ArmGen::FixupBranch ptr;
if (!likely)
{
// preserve flag around the delay slot! Maybe this is not always necessary on ARM where
// we can (mostly) control whether we set the flag or not. Of course, if someone puts an slt in to the
// delay slot, we're screwed.
MRS(R8); // Save flags register. R8 is preserved through function calls and is not allocated.
CompileAt(js.compilerPC + 4);
FlushAll();
_MSR(true, false, R8); // Restore flags register
if (!delaySlotIsNice)
CompileDelaySlot(DELAYSLOT_SAFE_FLUSH);
ptr = B_CC(cc);
}
else
{
ptr = B_CC(cc);
CompileAt(js.compilerPC + 4);
FlushAll();
CompileDelaySlot(DELAYSLOT_FLUSH);
}
js.inDelaySlot = false;
// Take the branch
WriteExit(targetAddr, 0);
@@ -127,36 +120,27 @@ void Jit::BranchRSZeroComp(u32 op, ArmGen::CCFlags cc, bool andLink, bool likely
u32 targetAddr = js.compilerPC + offset + 4;
u32 delaySlotOp = Memory::ReadUnchecked_U32(js.compilerPC + 4);
bool delaySlotIsNice = IsDelaySlotNiceReg(op, delaySlotOp, rs);
CONDITIONAL_NICE_DELAYSLOT;
if (!likely && delaySlotIsNice)
CompileDelaySlot(DELAYSLOT_NICE);
bool delaySlotIsNice = GetOutReg(delaySlotOp) != rs;
if (!delaySlotIsNice)
{
// ERROR_LOG(CPU, "Not nice delay slot in BranchRSZeroComp :( %08x", js.compilerPC);
}
gpr.MapReg(rs);
CMP(gpr.R(rs), Operand2(0, TYPE_IMM));
FlushAll();
ArmGen::FixupBranch ptr;
js.inDelaySlot = true;
if (!likely)
{
// preserve flag around the delay slot! Maybe this is not always necessary on ARM where
// we can (mostly) control whether we set the flag or not. Of course, if someone puts an slt in to the
// delay slot, we're screwed.
MRS(R8); // Save flags register
CompileAt(js.compilerPC + 4);
FlushAll();
_MSR(true, false, R8); // Restore flags register
if (!delaySlotIsNice)
CompileDelaySlot(DELAYSLOT_SAFE_FLUSH);
ptr = B_CC(cc);
}
else
{
ptr = B_CC(cc);
CompileAt(js.compilerPC + 4);
FlushAll();
CompileDelaySlot(DELAYSLOT_FLUSH);
}
js.inDelaySlot = false;
// Take the branch
if (andLink)
@@ -165,6 +149,7 @@ void Jit::BranchRSZeroComp(u32 op, ArmGen::CCFlags cc, bool andLink, bool likely
ARMABI_MOVI2R(R0, js.compilerPC + 8);
STR(R1, R0);
}
WriteExit(targetAddr, 0);
SetJumpTarget(ptr);
@@ -228,35 +213,28 @@ void Jit::BranchFPFlag(u32 op, ArmGen::CCFlags cc, bool likely)
u32 targetAddr = js.compilerPC + offset + 4;
u32 delaySlotOp = Memory::ReadUnchecked_U32(js.compilerPC + 4);
bool delaySlotIsNice = IsDelaySlotNiceFPU(op, delaySlotOp);
CONDITIONAL_NICE_DELAYSLOT;
if (!likely && delaySlotIsNice)
CompileDelaySlot(DELAYSLOT_NICE);
bool delaySlotIsNice = IsDelaySlotNice(op, delaySlotOp);
if (!delaySlotIsNice)
{
//ERROR_LOG(CPU, "Not nice delay slot in BranchFPFlag :(");
}
FlushAll();
FlushAll();
LDR(R0, R10, offsetof(MIPSState, fpcond));
TST(R0, Operand2(1, TYPE_IMM));
ArmGen::FixupBranch ptr;
js.inDelaySlot = true;
if (!likely)
{
MRS(R8); // Save flags register
CompileAt(js.compilerPC + 4);
FlushAll();
_MSR(true, false, R8); // Restore flags register
if (!delaySlotIsNice)
CompileDelaySlot(DELAYSLOT_SAFE_FLUSH);
ptr = B_CC(cc);
}
else
{
ptr = B_CC(cc);
CompileAt(js.compilerPC + 4);
FlushAll();
CompileDelaySlot(DELAYSLOT_FLUSH);
}
js.inDelaySlot = false;
// Take the branch
WriteExit(targetAddr, 0);
@@ -290,36 +268,31 @@ void Jit::BranchVFPUFlag(u32 op, ArmGen::CCFlags cc, bool likely)
u32 delaySlotOp = Memory::ReadUnchecked_U32(js.compilerPC + 4);
bool delaySlotIsNice = IsDelaySlotNice(op, delaySlotOp);
if (!delaySlotIsNice)
{
//ERROR_LOG(CPU, "Not nice delay slot in BranchFPFlag :(");
}
bool delaySlotIsNice = IsDelaySlotNiceVFPU(op, delaySlotOp);
CONDITIONAL_NICE_DELAYSLOT;
if (!likely && delaySlotIsNice)
CompileDelaySlot(DELAYSLOT_NICE);
FlushAll();
int imm3 = (op >> 18) & 7;
ARMABI_MOVI2R(R0, (u32)&(mips_->vfpuCtrl[VFPU_CTRL_CC]));
LDR(R0, R0, Operand2(0, TYPE_IMM));
int imm3 = (op >> 18) & 7;
TST(R0, Operand2(1 << imm3, TYPE_IMM));
ArmGen::FixupBranch ptr;
js.inDelaySlot = true;
if (!likely)
{
MRS(R8); // Save flags register
CompileAt(js.compilerPC + 4);
FlushAll();
_MSR(true, false, R8); // Restore flags register
if (!delaySlotIsNice)
CompileDelaySlot(DELAYSLOT_SAFE_FLUSH);
ptr = B_CC(cc);
}
else
{
ptr = B_CC(cc);
CompileAt(js.compilerPC + 4);
FlushAll();
CompileDelaySlot(DELAYSLOT_FLUSH);
}
js.inDelaySlot = false;
@@ -356,8 +329,7 @@ void Jit::Comp_Jump(u32 op)
}
u32 off = ((op & 0x03FFFFFF) << 2);
u32 targetAddr = (js.compilerPC & 0xF0000000) | off;
// Delay slot
CompileAt(js.compilerPC + 4);
CompileDelaySlot(DELAYSLOT_NICE);
FlushAll();
switch (op >> 26)
@@ -389,27 +361,26 @@ void Jit::Comp_JumpReg(u32 op)
int rs = _RS;
u32 delaySlotOp = Memory::ReadUnchecked_U32(js.compilerPC + 4);
bool delaySlotIsNice = GetOutReg(delaySlotOp) != rs;
// Do what with that information?
delaySlotIsNice = false;
bool delaySlotIsNice = IsDelaySlotNiceReg(op, delaySlotOp, rs);
CONDITIONAL_NICE_DELAYSLOT;
if (delaySlotIsNice) {
CompileAt(js.compilerPC + 4);
if (IsSyscall(delaySlotOp)) {
gpr.MapReg(rs);
MOV(R8, gpr.R(rs));
MovToPC(R8); // For syscall to be able to return.
CompileDelaySlot(DELAYSLOT_FLUSH);
return; // Syscall wrote exit code.
} else if (delaySlotIsNice) {
CompileDelaySlot(DELAYSLOT_NICE);
gpr.MapReg(rs);
MOV(R8, gpr.R(rs)); // Save the destination address through the delay slot. Could use isNice to avoid when the jit is fully implemented
FlushAll();
MovToPC(R8); // For syscall to be able to return. Could be avoided with some checking.
} else {
// Delay slot
gpr.MapReg(rs);
MOV(R8, gpr.R(rs)); // Save the destination address through the delay slot. Could use isNice to avoid when the jit is fully implemented
MovToPC(R8); // For syscall to be able to return. Could be avoided with some checking.
CompileAt(js.compilerPC + 4);
CompileDelaySlot(DELAYSLOT_NICE);
FlushAll();
if (!js.compiling) {
// INFO_LOG(HLE, "Syscall in delay slot!");
return;
}
}
switch (op & 0x3f)
@@ -435,6 +406,11 @@ void Jit::Comp_Syscall(u32 op)
{
FlushAll();
// If we're in a delay slot, this is off by one.
const int offset = js.inDelaySlot ? -1 : 0;
WriteDownCount(offset);
js.downcountAmount = -offset;
ARMABI_MOVI2R(R0, op);
QuickCallFunction(R1, (void *)&CallSyscall);

View File

@@ -88,6 +88,25 @@ void Jit::CompileAt(u32 addr)
MIPSCompileOp(op);
}
// Compiles the instruction sitting in the branch delay slot (the word at
// js.compilerPC + 4). 'flags' is a bitmask of CompileDelaySlotFlags that
// controls whether the CPU flags are preserved across the slot and whether
// the register cache is flushed afterwards.
void Jit::CompileDelaySlot(int flags)
{
// preserve flag around the delay slot! Maybe this is not always necessary on ARM where
// we can (mostly) control whether we set the flag or not. Of course, if someone puts an slt in to the
// delay slot, we're screwed.
if (flags & DELAYSLOT_SAFE)
MRS(R8); // Save flags register. R8 is preserved through function calls and is not allocated.
js.inDelaySlot = true; // let the compiled op know it is in a delay slot
u32 op = Memory::Read_Instruction(js.compilerPC + 4); // fetch the delay-slot instruction
MIPSCompileOp(op);
js.inDelaySlot = false;
if (flags & DELAYSLOT_FLUSH)
FlushAll(); // write back and release all cached registers
if (flags & DELAYSLOT_SAFE)
_MSR(true, false, R8); // Restore flags register
}
void Jit::Compile(u32 em_address)
{
if (GetSpaceLeft() < 0x10000 || blocks.IsFull())
@@ -209,18 +228,19 @@ void Jit::MovToPC(ARMReg r) {
STR(R10, r, offsetof(MIPSState, pc));
}
void Jit::DoDownCount()
void Jit::WriteDownCount(int offset)
{
int theDowncount = js.downcountAmount + offset;
LDR(R1, R10, offsetof(MIPSState, downcount));
Operand2 op2;
if (TryMakeOperand2(js.downcountAmount, op2)) // We can enlarge this if we used rotations
if (TryMakeOperand2(theDowncount, op2)) // We can enlarge this if we used rotations
{
SUBS(R1, R1, op2);
STR(R10, R1, offsetof(MIPSState, downcount));
} else {
// Should be fine to use R2 here, flushed the regcache anyway.
// If js.downcountAmount can be expressed as an Imm8, we don't need this anyway.
ARMABI_MOVI2R(R2, js.downcountAmount);
ARMABI_MOVI2R(R2, theDowncount);
SUBS(R1, R1, R2);
STR(R10, R1, offsetof(MIPSState, downcount));
}
@@ -232,7 +252,7 @@ void Jit::DoDownCount()
// I don't think this gives us that much benefit.
void Jit::WriteExit(u32 destination, int exit_num)
{
DoDownCount();
WriteDownCount();
//If nobody has taken care of this yet (this can be removed when all branches are done)
ArmJitBlock *b = js.curBlock;
b->exitAddress[exit_num] = destination;
@@ -254,17 +274,22 @@ void Jit::WriteExit(u32 destination, int exit_num)
void Jit::WriteExitDestInR(ARMReg Reg)
{
MovToPC(Reg);
DoDownCount();
WriteDownCount();
// TODO: shouldn't need an indirect branch here...
B((const void *)dispatcher);
}
void Jit::WriteSyscallExit()
{
DoDownCount();
WriteDownCount();
B((const void *)dispatcherCheckCoreState);
}
// Debug helper: logs the index of the most recently allocated JIT block
// (GetNumBlocks() counts blocks, so the last index is count - 1).
void Jit::LogBlockNumber()
{
INFO_LOG(CPU, "Block number: %i", blocks.GetNumBlocks() - 1);
}
#define _RS ((op>>21) & 0x1F)
#define _RT ((op>>16) & 0x1F)

View File

@@ -47,6 +47,19 @@ struct ArmJitState
ArmJitBlock *curBlock;
};
// Flag bits accepted by Jit::CompileDelaySlot to control what extra work
// is emitted around the delay-slot instruction.
enum CompileDelaySlotFlags
{
// Easy, nothing extra.
DELAYSLOT_NICE = 0,
// Flush registers after delay slot.
DELAYSLOT_FLUSH = 1,
// Preserve flags (CPSR is saved in R8 across the slot and restored after).
DELAYSLOT_SAFE = 2,
// Flush registers after and preserve flags.
DELAYSLOT_SAFE_FLUSH = DELAYSLOT_FLUSH | DELAYSLOT_SAFE,
};
class Jit : public ArmGen::ARMXCodeBlock
{
public:
@@ -62,6 +75,7 @@ public:
void Compile(u32 em_address); // Compiles a block at current MIPS PC
const u8 *DoJit(u32 em_address, ArmJitBlock *b);
void CompileDelaySlot(int flags);
void CompileAt(u32 addr);
void Comp_RunBlock(u32 op);
@@ -99,8 +113,7 @@ private:
void GenerateFixedCode();
void FlushAll();
// TODO: Split into two parts, the first part can be shared in branches.
void DoDownCount();
void WriteDownCount(int offset = 0);
void MovFromPC(ARMReg r);
void MovToPC(ARMReg r);
@@ -117,6 +130,8 @@ private:
// Utilities to reduce duplicated code
void CompImmLogic(int rs, int rt, u32 uimm, void (ARMXEmitter::*arith)(ARMReg dst, ARMReg src, Operand2 op2), u32 (*eval)(u32 a, u32 b));
void CompShiftImm(u32 op, ArmGen::ShiftType shiftType);
void LogBlockNumber();
/*
void CompImmLogic(u32 op, void (ARMXEmitter::*arith)(int, const OpArg &, const OpArg &));
void CompTriArith(u32 op, void (ARMXEmitter::*arith)(int, const OpArg &, const OpArg &));

View File

@@ -79,20 +79,6 @@ typedef void (*CompiledCode)();
class ArmJitBlockCache
{
MIPSState *mips;
const u8 **blockCodePointers;
ArmJitBlock *blocks;
int num_blocks;
std::multimap<u32, int> links_to;
std::map<std::pair<u32,u32>, u32> block_map; // (end_addr, start_addr) -> number
int MAX_NUM_BLOCKS;
bool RangeIntersect(int s1, int e1, int s2, int e2) const;
void LinkBlockExits(int i);
void LinkBlock(int i);
void UnlinkBlock(int i);
public:
ArmJitBlockCache(MIPSState *mips_) :
mips(mips_), blockCodePointers(0), blocks(0), num_blocks(0),
@@ -134,4 +120,19 @@ public:
// Not currently used
//void DestroyBlocksWithFlag(BlockFlag death_flag);
private:
MIPSState *mips;
const u8 **blockCodePointers;
ArmJitBlock *blocks;
int num_blocks;
std::multimap<u32, int> links_to;
std::map<std::pair<u32,u32>, u32> block_map; // (end_addr, start_addr) -> number
int MAX_NUM_BLOCKS;
bool RangeIntersect(int s1, int e1, int s2, int e2) const;
void LinkBlockExits(int i);
void LinkBlock(int i);
void UnlinkBlock(int i);
};