From ccd8233ea378b5737e4c7d086d4d762f8645951c Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sun, 13 Jun 2021 16:03:47 +0200 Subject: [PATCH] Jit64: Fix FPRF handling of denormal singles --- Source/Core/Core/PowerPC/Jit64/Jit.h | 5 +- .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 76 +++++++++--- Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp | 12 +- .../Core/PowerPC/Jit64Common/EmuCodeBlock.cpp | 110 +++++++++--------- .../Core/PowerPC/Jit64Common/EmuCodeBlock.h | 4 +- 5 files changed, 128 insertions(+), 79 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 029cef0974..3480c85279 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -121,8 +121,11 @@ public: // Generates a branch that will check if a given bit of a CR register part // is set or not. Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true); - void SetFPRFIfNeeded(Gen::X64Reg xmm); + void SetFPRFIfNeeded(const Gen::OpArg& xmm, bool single); + void FinalizeSingleResult(Gen::X64Reg output, const Gen::OpArg& input, bool packed = true, + bool duplicate = false); + void FinalizeDoubleResult(Gen::X64Reg output, const Gen::OpArg& input); void HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm_out, Gen::X64Reg xmm_in, Gen::X64Reg clobber = Gen::XMM0); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index d4ae8ca797..957a0c461f 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -33,13 +33,63 @@ alignas(16) static const double half_qnan_and_s32_max[2] = {0x7FFFFFFF, -0x80000 // We can avoid calculating FPRF if it's not needed; every float operation resets it, so // if it's going to be clobbered in a future instruction before being read, we can just // not calculate it. 
-void Jit64::SetFPRFIfNeeded(X64Reg xmm) +void Jit64::SetFPRFIfNeeded(const OpArg& input, bool single) { // As far as we know, the games that use this flag only need FPRF for fmul and fmadd, but // FPRF is fast enough in JIT that we might as well just enable it for every float instruction // if the FPRF flag is set. - if (SConfig::GetInstance().bFPRF && js.op->wantsFPRF) - SetFPRF(xmm); + if (!SConfig::GetInstance().bFPRF || !js.op->wantsFPRF) + return; + + X64Reg xmm = XMM0; + if (input.IsSimpleReg()) + xmm = input.GetSimpleReg(); + else + MOVSD(xmm, input); + + SetFPRF(xmm, single); +} + +void Jit64::FinalizeSingleResult(X64Reg output, const OpArg& input, bool packed, bool duplicate) +{ + // Most games don't need these. Zelda requires it though - some platforms get stuck without them. + if (jo.accurateSinglePrecision) + { + if (packed) + { + CVTPD2PS(output, input); + SetFPRFIfNeeded(R(output), true); + CVTPS2PD(output, R(output)); + } + else + { + CVTSD2SS(output, input); + SetFPRFIfNeeded(R(output), true); + CVTSS2SD(output, R(output)); + if (duplicate) + MOVDDUP(output, R(output)); + } + } + else + { + if (!input.IsSimpleReg(output)) + { + if (duplicate) + MOVDDUP(output, input); + else + MOVAPD(output, input); + } + + SetFPRFIfNeeded(input, false); // Not converted to single on this path: still a 64-bit double + } +} + +void Jit64::FinalizeDoubleResult(X64Reg output, const OpArg& input) +{ + if (!input.IsSimpleReg(output)) + MOVSD(output, input); + + SetFPRFIfNeeded(input, false); } void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm_out, X64Reg xmm, X64Reg clobber) @@ -210,8 +260,9 @@ void Jit64::fp_arith(UGeckoInstruction inst) HandleNaNs(inst, Rd, dest); if (single) - ForceSinglePrecision(Rd, Rd, packed, true); - SetFPRFIfNeeded(Rd); + FinalizeSingleResult(Rd, Rd, packed, true); + else + FinalizeDoubleResult(Rd, Rd); }; switch (inst.SUBOP5) @@ -452,14 +503,13 @@ void Jit64::fmaddXX(UGeckoInstruction inst) if (single) { HandleNaNs(inst, result_reg, result_reg, result_reg == XMM1 ? 
XMM0 : XMM1); - ForceSinglePrecision(Rd, R(result_reg), packed, true); + FinalizeSingleResult(Rd, R(result_reg), packed, true); } else { HandleNaNs(inst, result_reg, result_reg, XMM1); - MOVSD(Rd, R(result_reg)); + FinalizeDoubleResult(Rd, R(result_reg)); } - SetFPRFIfNeeded(Rd); } void Jit64::fsign(UGeckoInstruction inst) @@ -763,12 +813,11 @@ void Jit64::frspx(UGeckoInstruction inst) int d = inst.FD; bool packed = js.op->fprIsDuplicated[b] && !cpu_info.bAtom; - RCOpArg Rb = fpr.Use(b, RCMode::Read); + RCOpArg Rb = fpr.Bind(b, RCMode::Read); RCX64Reg Rd = fpr.Bind(d, RCMode::Write); RegCache::Realize(Rb, Rd); - ForceSinglePrecision(Rd, Rb, packed, true); - SetFPRFIfNeeded(Rd); + FinalizeSingleResult(Rd, Rb, packed, true); } void Jit64::frsqrtex(UGeckoInstruction inst) @@ -786,8 +835,7 @@ void Jit64::frsqrtex(UGeckoInstruction inst) MOVAPD(XMM0, Rb); CALL(asm_routines.frsqrte); - MOVSD(Rd, XMM0); - SetFPRFIfNeeded(Rd); + FinalizeDoubleResult(Rd, R(XMM0)); } void Jit64::fresx(UGeckoInstruction inst) @@ -806,5 +854,5 @@ void Jit64::fresx(UGeckoInstruction inst) MOVAPD(XMM0, Rb); CALL(asm_routines.fres); MOVDDUP(Rd, R(XMM0)); - SetFPRFIfNeeded(Rd); + SetFPRFIfNeeded(R(XMM0), false); // XMM0 is copied out as a 64-bit double, so classify it as one } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index fa5a91bd8c..d07b9e6bc0 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -77,8 +77,7 @@ void Jit64::ps_sum(UGeckoInstruction inst) PanicAlertFmt("ps_sum WTF!!!"); } HandleNaNs(inst, Rd, tmp, tmp == XMM1 ? 
XMM0 : XMM1); - ForceSinglePrecision(Rd, Rd); - SetFPRFIfNeeded(Rd); + FinalizeSingleResult(Rd, Rd); } void Jit64::ps_muls(UGeckoInstruction inst) @@ -112,8 +111,7 @@ void Jit64::ps_muls(UGeckoInstruction inst) Force25BitPrecision(XMM1, R(XMM1), XMM0); MULPD(XMM1, Ra); HandleNaNs(inst, Rd, XMM1); - ForceSinglePrecision(Rd, Rd); - SetFPRFIfNeeded(Rd); + FinalizeSingleResult(Rd, Rd); } void Jit64::ps_mergeXX(UGeckoInstruction inst) @@ -171,8 +169,7 @@ void Jit64::ps_rsqrte(UGeckoInstruction inst) CALL(asm_routines.frsqrte); MOVLHPS(Rd, XMM0); - ForceSinglePrecision(Rd, Rd); - SetFPRFIfNeeded(Rd); + FinalizeSingleResult(Rd, Rd); } void Jit64::ps_res(UGeckoInstruction inst) @@ -196,8 +193,7 @@ void Jit64::ps_res(UGeckoInstruction inst) CALL(asm_routines.fres); MOVLHPS(Rd, XMM0); - ForceSinglePrecision(Rd, Rd); - SetFPRFIfNeeded(Rd); + FinalizeSingleResult(Rd, Rd); } void Jit64::ps_cmpXX(UGeckoInstruction inst) diff --git a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp index 409b158891..01a5115ba0 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp +++ b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp @@ -727,34 +727,6 @@ void EmuCodeBlock::JitClearCA() MOV(8, PPCSTATE(xer_ca), Imm8(0)); } -void EmuCodeBlock::ForceSinglePrecision(X64Reg output, const OpArg& input, bool packed, - bool duplicate) -{ - // Most games don't need these. Zelda requires it though - some platforms get stuck without them. 
- if (m_jit.jo.accurateSinglePrecision) - { - if (packed) - { - CVTPD2PS(output, input); - CVTPS2PD(output, R(output)); - } - else - { - CVTSD2SS(output, input); - CVTSS2SD(output, R(output)); - if (duplicate) - MOVDDUP(output, R(output)); - } - } - else if (!input.IsSimpleReg(output)) - { - if (duplicate) - MOVDDUP(output, input); - else - MOVAPD(output, input); - } -} - // Abstract between AVX and SSE: automatically handle 3-operand instructions void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&), void (XEmitter::*sseOp)(X64Reg, const OpArg&), X64Reg regOp, @@ -907,30 +879,35 @@ void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr MOVDDUP(dst, R(dst)); } -alignas(16) static const u64 psDoubleExp[2] = {0x7FF0000000000000ULL, 0}; -alignas(16) static const u64 psDoubleFrac[2] = {0x000FFFFFFFFFFFFFULL, 0}; -alignas(16) static const u64 psDoubleNoSign[2] = {0x7FFFFFFFFFFFFFFFULL, 0}; +alignas(16) static const u64 psDoubleExp[2] = {Common::DOUBLE_EXP, 0}; +alignas(16) static const u64 psDoubleFrac[2] = {Common::DOUBLE_FRAC, 0}; +alignas(16) static const u64 psDoubleNoSign[2] = {~Common::DOUBLE_SIGN, 0}; + +alignas(16) static const u32 psFloatExp[4] = {Common::FLOAT_EXP, 0, 0, 0}; +alignas(16) static const u32 psFloatFrac[4] = {Common::FLOAT_FRAC, 0, 0, 0}; +alignas(16) static const u32 psFloatNoSign[4] = {~Common::FLOAT_SIGN, 0, 0, 0}; // TODO: it might be faster to handle FPRF in the same way as CR is currently handled for integer, -// storing -// the result of each floating point op and calculating it when needed. This is trickier than for -// integers -// though, because there's 32 possible FPRF bit combinations but only 9 categories of floating point -// values, -// which makes the whole thing rather trickier. -// Fortunately, PPCAnalyzer can optimize out a large portion of FPRF calculations, so maybe this -// isn't -// quite that necessary. 
-void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm) +// storing the result of each floating point op and calculating it when needed. This is trickier +// than for integers though, because there's 32 possible FPRF bit combinations but only 9 categories +// of floating point values. Fortunately, PPCAnalyzer can optimize out a large portion of FPRF +// calculations, so maybe this isn't quite that necessary. +void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm, bool single) { + const int input_size = single ? 32 : 64; + AND(32, PPCSTATE(fpscr), Imm32(~FPRF_MASK)); FixupBranch continue1, continue2, continue3, continue4; if (cpu_info.bSSE4_1) { MOVQ_xmm(R(RSCRATCH), xmm); - SHR(64, R(RSCRATCH), Imm8(63)); // Get the sign bit; almost all the branches need it. - PTEST(xmm, MConst(psDoubleExp)); + // Get the sign bit; almost all the branches need it. + SHR(input_size, R(RSCRATCH), Imm8(input_size - 1)); + if (single) + PTEST(xmm, MConst(psFloatExp)); + else + PTEST(xmm, MConst(psDoubleExp)); FixupBranch maxExponent = J_CC(CC_C); FixupBranch zeroExponent = J_CC(CC_Z); @@ -940,7 +917,10 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm) continue1 = J(); SetJumpTarget(maxExponent); - PTEST(xmm, MConst(psDoubleFrac)); + if (single) + PTEST(xmm, MConst(psFloatFrac)); + else + PTEST(xmm, MConst(psDoubleFrac)); FixupBranch notNAN = J_CC(CC_Z); // Max exponent + mantissa: PPC_FPCLASS_QNAN @@ -955,7 +935,10 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm) continue3 = J(); SetJumpTarget(zeroExponent); - PTEST(xmm, MConst(psDoubleNoSign)); + if (single) + PTEST(xmm, MConst(psFloatNoSign)); + else + PTEST(xmm, MConst(psDoubleNoSign)); FixupBranch zero = J_CC(CC_Z); // No exponent + mantissa: sign ? 
PPC_FPCLASS_ND : PPC_FPCLASS_PD; @@ -971,37 +954,58 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm) else { MOVQ_xmm(R(RSCRATCH), xmm); - TEST(64, R(RSCRATCH), MConst(psDoubleExp)); + if (single) + TEST(32, R(RSCRATCH), Imm32(Common::FLOAT_EXP)); + else + TEST(64, R(RSCRATCH), MConst(psDoubleExp)); FixupBranch zeroExponent = J_CC(CC_Z); - AND(64, R(RSCRATCH), MConst(psDoubleNoSign)); - CMP(64, R(RSCRATCH), MConst(psDoubleExp)); + + if (single) + { + AND(32, R(RSCRATCH), Imm32(~Common::FLOAT_SIGN)); + CMP(32, R(RSCRATCH), Imm32(Common::FLOAT_EXP)); + } + else + { + AND(64, R(RSCRATCH), MConst(psDoubleNoSign)); + CMP(64, R(RSCRATCH), MConst(psDoubleExp)); + } FixupBranch nan = J_CC(CC_G); // This works because if the sign bit is set, RSCRATCH is negative FixupBranch infinity = J_CC(CC_E); + MOVQ_xmm(R(RSCRATCH), xmm); - SHR(64, R(RSCRATCH), Imm8(63)); + SHR(input_size, R(RSCRATCH), Imm8(input_size - 1)); LEA(32, RSCRATCH, MScaled(RSCRATCH, Common::PPC_FPCLASS_NN - Common::PPC_FPCLASS_PN, Common::PPC_FPCLASS_PN)); continue1 = J(); + SetJumpTarget(nan); MOV(32, R(RSCRATCH), Imm32(Common::PPC_FPCLASS_QNAN)); continue2 = J(); + SetJumpTarget(infinity); MOVQ_xmm(R(RSCRATCH), xmm); - SHR(64, R(RSCRATCH), Imm8(63)); + SHR(input_size, R(RSCRATCH), Imm8(input_size - 1)); LEA(32, RSCRATCH, MScaled(RSCRATCH, Common::PPC_FPCLASS_NINF - Common::PPC_FPCLASS_PINF, Common::PPC_FPCLASS_PINF)); continue3 = J(); + SetJumpTarget(zeroExponent); - TEST(64, R(RSCRATCH), MConst(psDoubleNoSign)); + if (single) + TEST(input_size, R(RSCRATCH), Imm32(~Common::FLOAT_SIGN)); + else + TEST(input_size, R(RSCRATCH), MConst(psDoubleNoSign)); FixupBranch zero = J_CC(CC_Z); - SHR(64, R(RSCRATCH), Imm8(63)); + + SHR(input_size, R(RSCRATCH), Imm8(input_size - 1)); LEA(32, RSCRATCH, MScaled(RSCRATCH, Common::PPC_FPCLASS_ND - Common::PPC_FPCLASS_PD, Common::PPC_FPCLASS_PD)); continue4 = J(); + SetJumpTarget(zero); - SHR(64, R(RSCRATCH), Imm8(63)); + SHR(input_size, R(RSCRATCH), Imm8(input_size - 1)); SHL(32, 
R(RSCRATCH), Imm8(4)); ADD(32, R(RSCRATCH), Imm8(Common::PPC_FPCLASS_PZ)); } diff --git a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h index b8a1aae0c9..9f5c373df3 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h +++ b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h @@ -117,14 +117,12 @@ public: void (Gen::XEmitter::*sseOp)(Gen::X64Reg, const Gen::OpArg&, u8), Gen::X64Reg regOp, const Gen::OpArg& arg1, const Gen::OpArg& arg2, u8 imm); - void ForceSinglePrecision(Gen::X64Reg output, const Gen::OpArg& input, bool packed = true, - bool duplicate = false); void Force25BitPrecision(Gen::X64Reg output, const Gen::OpArg& input, Gen::X64Reg tmp); // RSCRATCH might get trashed void ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false); void ConvertDoubleToSingle(Gen::X64Reg dst, Gen::X64Reg src); - void SetFPRF(Gen::X64Reg xmm); + void SetFPRF(Gen::X64Reg xmm, bool single); void Clear(); protected: