From ca58f322e53e1c40c69e9c7d02edd8fa6c649a82 Mon Sep 17 00:00:00 2001
From: Henrik Rydgard
Date: Sun, 22 Mar 2015 00:04:50 +0100
Subject: [PATCH] ARM64: Port over some missing VFPU instructions from ARM. Not
 much left now.

---
 Common/Arm64Emitter.cpp           |   4 +-
 Common/Arm64Emitter.h             |   2 +-
 Common/ArmCommon.h                |   5 +
 Core/MIPS/ARM64/Arm64CompVFPU.cpp | 270 +++++++++++++++++++++++++++++-
 4 files changed, 273 insertions(+), 8 deletions(-)

diff --git a/Common/Arm64Emitter.cpp b/Common/Arm64Emitter.cpp
index d636ec41b1..24becd38b2 100644
--- a/Common/Arm64Emitter.cpp
+++ b/Common/Arm64Emitter.cpp
@@ -3280,12 +3280,12 @@ void ARM64XEmitter::ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
 	}
 }
 
-void ARM64XEmitter::ORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) {
+void ARM64XEmitter::ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) {
 	unsigned int n, imm_s, imm_r;
 	if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) {
 		ORR(Rd, Rn, imm_r, imm_s, n);
 	} else {
-		_assert_msg_(JIT, scratch != INVALID_REG, "ORI2R - failed to construct immediate value from %08x, need scratch", (u32)imm);
+		_assert_msg_(JIT, scratch != INVALID_REG, "ORRI2R - failed to construct immediate value from %08x, need scratch", (u32)imm);
 		MOVI2R(scratch, imm);
 		ORR(Rd, Rn, scratch);
 	}
 }
diff --git a/Common/Arm64Emitter.h b/Common/Arm64Emitter.h
index 28a3a277b9..c388b8ad7a 100644
--- a/Common/Arm64Emitter.h
+++ b/Common/Arm64Emitter.h
@@ -691,7 +691,7 @@ public:
 	void ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
 	void ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
 	void TSTI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG) { ANDSI2R(Is64Bit(Rn) ? ZR : WZR, Rn, imm, scratch); }
-	void ORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+	void ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
 	void EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
 	void CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
 
diff --git a/Common/ArmCommon.h b/Common/ArmCommon.h
index e55756016b..01968a5062 100644
--- a/Common/ArmCommon.h
+++ b/Common/ArmCommon.h
@@ -28,3 +28,8 @@ enum CCFlags
 };
 
 const u32 NO_COND = 0xE0000000;
+inline CCFlags InvertCond(CCFlags fl) {
+	int x = (int)fl;
+	x ^= 1;  // Conditions come in inverse pairs (EQ/NEQ, CS/CC, GE/LT, ...), so flipping bit 0 inverts.
+	return (CCFlags)x;
+}
\ No newline at end of file
diff --git a/Core/MIPS/ARM64/Arm64CompVFPU.cpp b/Core/MIPS/ARM64/Arm64CompVFPU.cpp
index 361d661edb..f1459db446 100644
--- a/Core/MIPS/ARM64/Arm64CompVFPU.cpp
+++ b/Core/MIPS/ARM64/Arm64CompVFPU.cpp
@@ -1195,11 +1195,231 @@ namespace MIPSComp
 	}
 
 	void Arm64Jit::Comp_Vcmp(MIPSOpcode op) {
-		DISABLE;
+		CONDITIONAL_DISABLE;
+		if (js.HasUnknownPrefix())
+			DISABLE;
+
+		VectorSize sz = GetVecSize(op);
+		int n = GetNumVectorElements(sz);
+
+		VCondition cond = (VCondition)(op & 0xF);
+
+		u8 sregs[4], tregs[4];
+		GetVectorRegsPrefixS(sregs, sz, _VS);
+		GetVectorRegsPrefixT(tregs, sz, _VT);
+
+		// For some of these, we just fall back to the interpreter.
+		// ES is really just equivalent to (value & 0x7F800000) == 0x7F800000.
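+		// In scalar C++ terms (illustrative only, not part of this patch):
+		//   u32 bits; memcpy(&bits, &f, sizeof(bits));
+		//   bool nan_or_inf = (bits & 0x7F800000) == 0x7F800000;  // exponent field all ones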
+
+		switch (cond) {
+		case VC_EI: // c = my_isinf(s[i]); break;
+		case VC_NI: // c = !my_isinf(s[i]); break;
+			DISABLE;
+		case VC_ES: // c = my_isnan(s[i]) || my_isinf(s[i]); break;  // Tekken Dark Resurrection
+		case VC_NS: // c = !my_isnan(s[i]) && !my_isinf(s[i]); break;
+		case VC_EN: // c = my_isnan(s[i]); break;
+		case VC_NN: // c = !my_isnan(s[i]); break;
+			if (_VS != _VT)
+				DISABLE;
+			break;
+
+		case VC_EZ:
+		case VC_NZ:
+			break;
+		default:
+			;
+		}
+
+		// First, let's handle the trivial ones.
+		int affected_bits = (1 << 4) | (1 << 5);  // The "any" and "all" aggregate bits.
+
+		MOVI2R(SCRATCH1, 0);
+		for (int i = 0; i < n; ++i) {
+			// Let's only handle the easy ones, and fall back on the interpreter for the rest.
+			CCFlags flag = CC_AL;
+			switch (cond) {
+			case VC_FL: // c = 0;
+				break;
+
+			case VC_TR: // c = 1;
+				if (i == 0) {
+					if (n == 1) {
+						MOVI2R(SCRATCH1, 0x31);
+					} else {
+						MOVI2R(SCRATCH1, 1 << i);
+					}
+				} else {
+					ORRI2R(SCRATCH1, SCRATCH1, 1 << i);
+				}
+				break;
+
+			case VC_ES: // c = my_isnan(s[i]) || my_isinf(s[i]); break;  // Tekken Dark Resurrection
+			case VC_NS: // c = !(my_isnan(s[i]) || my_isinf(s[i])); break;
+				// For these, we use the integer ALU as there is no FP instruction for testing for INF.
+				// Testing for nan or inf is the same as testing whether (value & 0x7F800000) == 0x7F800000.
+				// We need an extra temporary register, so we store away SCRATCH1.
+				STR(INDEX_UNSIGNED, SCRATCH1, CTXREG, offsetof(MIPSState, temp));
+				fpr.MapRegV(sregs[i], 0);
+				MOVI2R(SCRATCH1, 0x7F800000);
+				fp.FMOV(SCRATCH2, fpr.V(sregs[i]));
+				AND(SCRATCH2, SCRATCH2, SCRATCH1);
+				CMP(SCRATCH2, SCRATCH1);  // (SCRATCH2 & 0x7F800000) == 0x7F800000
+				flag = cond == VC_ES ? CC_EQ : CC_NEQ;
+				LDR(INDEX_UNSIGNED, SCRATCH1, CTXREG, offsetof(MIPSState, temp));
+				break;
+
+			case VC_EN: // c = my_isnan(s[i]); break;  // Tekken 6
+				// Should we involve T? Where I found this used, it compared a register with itself, so it should be fine.
+				fpr.MapInInV(sregs[i], tregs[i]);
+				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
+				flag = CC_VS;  // overflow = unordered : http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/Chdhcfbc.html
+				break;
+
+			case VC_NN: // c = !my_isnan(s[i]); break;
+				// Should we involve T? Where I found this used, it compared a register with itself, so it should be fine.
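+				// Note: FCMP of a value against itself is unordered exactly when the value
+				// is a NaN, which is what makes the V flag test here work.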
+				fpr.MapInInV(sregs[i], tregs[i]);
+				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
+				flag = CC_VC;  // !overflow = !unordered : http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/Chdhcfbc.html
+				break;
+
+			case VC_EQ: // c = s[i] == t[i];
+				fpr.MapInInV(sregs[i], tregs[i]);
+				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
+				flag = CC_EQ;
+				break;
+
+			case VC_LT: // c = s[i] < t[i];
+				fpr.MapInInV(sregs[i], tregs[i]);
+				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
+				flag = CC_LO;
+				break;
+
+			case VC_LE: // c = s[i] <= t[i];
+				fpr.MapInInV(sregs[i], tregs[i]);
+				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
+				flag = CC_LS;
+				break;
+
+			case VC_NE: // c = s[i] != t[i];
+				fpr.MapInInV(sregs[i], tregs[i]);
+				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
+				flag = CC_NEQ;
+				break;
+
+			case VC_GE: // c = s[i] >= t[i];
+				fpr.MapInInV(sregs[i], tregs[i]);
+				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
+				flag = CC_GE;
+				break;
+
+			case VC_GT: // c = s[i] > t[i];
+				fpr.MapInInV(sregs[i], tregs[i]);
+				fp.FCMP(fpr.V(sregs[i]), fpr.V(tregs[i]));
+				flag = CC_GT;
+				break;
+
+			case VC_EZ: // c = s[i] == 0.0f || s[i] == -0.0f;
+				fpr.MapRegV(sregs[i]);
+				fp.FCMP(fpr.V(sregs[i]));  // fcmp sregs[i], #0.0
+				flag = CC_EQ;
+				break;
+
+			case VC_NZ: // c = s[i] != 0;
+				fpr.MapRegV(sregs[i]);
+				fp.FCMP(fpr.V(sregs[i]));  // fcmp sregs[i], #0.0
+				flag = CC_NEQ;
+				break;
+
+			default:
+				DISABLE;
+			}
+			if (flag != CC_AL) {
+				FixupBranch b = B(InvertCond(flag));
+				if (i == 0) {
+					if (n == 1) {
+						MOVI2R(SCRATCH1, 0x31);
+					} else {
+						MOVI2R(SCRATCH1, 1);  // 1 << i, but i == 0
+					}
+				} else {
+					ORRI2R(SCRATCH1, SCRATCH1, 1 << i);
+				}
+				SetJumpTarget(b);
+			}
+
+			affected_bits |= 1 << i;
+		}
+
+		// Aggregate the bits. Urgh, expensive. Can optimize for the case of one comparison,
+		// which is the most common case after all.
+		if (n > 1) {
+			CMP(SCRATCH1, affected_bits & 0xF);
+			FixupBranch skip1 = B(CC_NEQ);
+			ORRI2R(SCRATCH1, SCRATCH1, 1 << 5);
+			SetJumpTarget(skip1);
+
+			CMP(SCRATCH1, 0);
+			FixupBranch skip2 = B(CC_EQ);
+			ORRI2R(SCRATCH1, SCRATCH1, 1 << 4);
+			SetJumpTarget(skip2);
+		}
+
+		gpr.MapReg(MIPS_REG_VFPUCC, MAP_DIRTY);
+		ANDI2R(gpr.R(MIPS_REG_VFPUCC), gpr.R(MIPS_REG_VFPUCC), ~affected_bits);
+		ORR(gpr.R(MIPS_REG_VFPUCC), gpr.R(MIPS_REG_VFPUCC), SCRATCH1);
+
+		fpr.ReleaseSpillLocksAndDiscardTemps();
 	}
 
 	void Arm64Jit::Comp_Vcmov(MIPSOpcode op) {
-		DISABLE;
+		CONDITIONAL_DISABLE;
+		if (js.HasUnknownPrefix()) {
+			DISABLE;
+		}
+
+		VectorSize sz = GetVecSize(op);
+		int n = GetNumVectorElements(sz);
+
+		u8 sregs[4], dregs[4];
+		GetVectorRegsPrefixS(sregs, sz, _VS);
+		GetVectorRegsPrefixD(dregs, sz, _VD);
+		int tf = (op >> 19) & 1;
+		int imm3 = (op >> 16) & 7;
+
+		for (int i = 0; i < n; ++i) {
+			// Simplification: disable if overlap is unsafe.
+			if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
+				DISABLE;
+			}
+		}
+
+		if (imm3 < 6) {
+			// Test one bit of CC. This bit decides whether none or all of the subregisters are copied.
+			fpr.MapRegsAndSpillLockV(dregs, sz, MAP_DIRTY);
+			fpr.MapRegsAndSpillLockV(sregs, sz, 0);
+			gpr.MapReg(MIPS_REG_VFPUCC);
+			TSTI2R(gpr.R(MIPS_REG_VFPUCC), 1 << imm3);
+			// TODO: Use FCSEL?
+			FixupBranch b = B(tf ? CC_NEQ : CC_EQ);
+			for (int i = 0; i < n; i++) {
+				fp.FMOV(fpr.V(dregs[i]), fpr.V(sregs[i]));
+			}
+			SetJumpTarget(b);
+		} else {
+			// Look at the bottom four bits of CC to individually decide if the subregisters should be copied.
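+			// For example, with a vec4, CC = 0b0101 and tf == 0, lanes 0 and 2 are
+			// copied from S while lanes 1 and 3 keep their old values.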
+			fpr.MapRegsAndSpillLockV(dregs, sz, MAP_DIRTY);
+			fpr.MapRegsAndSpillLockV(sregs, sz, 0);
+			gpr.MapReg(MIPS_REG_VFPUCC);
+			for (int i = 0; i < n; i++) {
+				TSTI2R(gpr.R(MIPS_REG_VFPUCC), 1 << i);
+				FixupBranch b = B(tf ? CC_NEQ : CC_EQ);
+				fp.FMOV(fpr.V(dregs[i]), fpr.V(sregs[i]));
+				SetJumpTarget(b);
+			}
+		}
+
+		ApplyPrefixD(dregs, sz);
+		fpr.ReleaseSpillLocksAndDiscardTemps();
 	}
 
 	void Arm64Jit::Comp_Viim(MIPSOpcode op) {
@@ -1303,7 +1523,7 @@ namespace MIPSComp
 	// Very heavily used by FF:CC. Should be replaced by a fast approximation instead of
 	// calling the math library.
 	void Arm64Jit::Comp_VRot(MIPSOpcode op) {
-		DISABLE;
+		DISABLE;  // Need to figure out how to deal with the return values from the function.
 
 		// VRot probably doesn't accept prefixes anyway.
 		CONDITIONAL_DISABLE;
@@ -1343,7 +1563,6 @@ namespace MIPSComp
 		bool negSin1 = (imm & 0x10) ? true : false;
 
 		fpr.MapRegV(sreg);
-		// We should write a custom pure-asm function instead.
 		fp.FMOV(S0, fpr.V(sreg));
 		QuickCallFunction(SCRATCH2_64, negSin1 ? (void *)&SinCosNegSin : (void *)&SinCos);
 		CompVrotShuffle(dregs, imm, sz, false);
@@ -1362,7 +1581,48 @@ namespace MIPSComp
 	}
 
 	void Arm64Jit::Comp_Vocp(MIPSOpcode op) {
-		DISABLE;
+		CONDITIONAL_DISABLE;
+		if (js.HasUnknownPrefix()) {
+			DISABLE;
+		}
+
+		VectorSize sz = GetVecSize(op);
+		int n = GetNumVectorElements(sz);
+
+		u8 sregs[4], dregs[4];
+		// Actually, not sure that this instruction accepts an S prefix. We don't apply it in the
+		// interpreter. But whatever.
+		GetVectorRegsPrefixS(sregs, sz, _VS);
+		GetVectorRegsPrefixD(dregs, sz, _VD);
+
+		MIPSReg tempregs[4];
+		for (int i = 0; i < n; ++i) {
+			if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
+				tempregs[i] = fpr.GetTempV();
+			} else {
+				tempregs[i] = dregs[i];
+			}
+		}
+
+		fp.MOVI2F(S0, 1.0f, SCRATCH1);
+		for (int i = 0; i < n; ++i) {
+			fpr.MapDirtyInV(tempregs[i], sregs[i]);
+			// Let's do it in scalar FP registers for now. NEON later.
+			// There's gotta be a shorter way, but I can't find one that treats NaNs
+			// like the interpreter does (it ignores them and just operates on the bits).
+			fp.FSUB(fpr.V(tempregs[i]), S0, fpr.V(sregs[i]));
+		}
+
+		for (int i = 0; i < n; ++i) {
+			if (dregs[i] != tempregs[i]) {
+				fpr.MapDirtyInV(dregs[i], tempregs[i]);
+				fp.FMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
+			}
+		}
+
+		ApplyPrefixD(dregs, sz);
+
+		fpr.ReleaseSpillLocksAndDiscardTemps();
 	}
 
 	void Arm64Jit::Comp_ColorConv(MIPSOpcode op) {
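
A note on the CC layout used by Comp_Vcmp above: bits 0..3 of VFPU_CC hold the
per-lane comparison results, bit 4 means "any lane true" and bit 5 means "all
lanes true", which is why the single-lane case can load the constant 0x31
(bits 0, 4 and 5) in one go. A scalar sketch of the aggregation, using a
hypothetical lane_result() helper (illustrative only, not part of the patch):

	u32 cc = 0;
	for (int i = 0; i < n; ++i)
		cc |= (u32)lane_result(i) << i;     // lane_result() stands in for the per-lane compare
	if (cc == (1u << n) - 1) cc |= 1 << 5;  // all lanes true
	if (cc != 0)             cc |= 1 << 4;  // any lane true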