From ccd8233ea378b5737e4c7d086d4d762f8645951c Mon Sep 17 00:00:00 2001
From: JosJuice <josjuice@gmail.com>
Date: Sun, 13 Jun 2021 16:03:47 +0200
Subject: [PATCH] Jit64: Fix FPRF handling of denormal singles

---
 Source/Core/Core/PowerPC/Jit64/Jit.h          |   5 +-
 .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp  |  76 +++++++++---
 Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp |  12 +-
 .../Core/PowerPC/Jit64Common/EmuCodeBlock.cpp | 110 +++++++++---------
 .../Core/PowerPC/Jit64Common/EmuCodeBlock.h   |   4 +-
 5 files changed, 128 insertions(+), 79 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h
index 029cef0974..3480c85279 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@@ -121,8 +121,11 @@ public:
   // Generates a branch that will check if a given bit of a CR register part
   // is set or not.
   Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
-  void SetFPRFIfNeeded(Gen::X64Reg xmm);
 
+  void SetFPRFIfNeeded(const Gen::OpArg& xmm, bool single);
+  void FinalizeSingleResult(Gen::X64Reg output, const Gen::OpArg& input, bool packed = true,
+                            bool duplicate = false);
+  void FinalizeDoubleResult(Gen::X64Reg output, const Gen::OpArg& input);
   void HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm_out, Gen::X64Reg xmm_in,
                   Gen::X64Reg clobber = Gen::XMM0);
 
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index d4ae8ca797..957a0c461f 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -33,13 +33,63 @@ alignas(16) static const double half_qnan_and_s32_max[2] = {0x7FFFFFFF, -0x80000
 // We can avoid calculating FPRF if it's not needed; every float operation resets it, so
 // if it's going to be clobbered in a future instruction before being read, we can just
 // not calculate it.
-void Jit64::SetFPRFIfNeeded(X64Reg xmm)
+void Jit64::SetFPRFIfNeeded(const OpArg& input, bool single)
 {
   // As far as we know, the games that use this flag only need FPRF for fmul and fmadd, but
   // FPRF is fast enough in JIT that we might as well just enable it for every float instruction
   // if the FPRF flag is set.
-  if (SConfig::GetInstance().bFPRF && js.op->wantsFPRF)
-    SetFPRF(xmm);
+  if (!SConfig::GetInstance().bFPRF || !js.op->wantsFPRF)
+    return;
+
+  X64Reg xmm = XMM0;  // Load target used when input is not already a plain register.
+  if (input.IsSimpleReg())
+    xmm = input.GetSimpleReg();
+  else
+    MOVSD(xmm, input);  // Scalar load of the operand's low 64 bits into the temporary.
+
+  SetFPRF(xmm, single);  // single: classify the low 32 bits as a float instead of a double.
+}
+
+void Jit64::FinalizeSingleResult(X64Reg output, const OpArg& input, bool packed, bool duplicate)
+{
+  // Most games don't need these. Zelda requires it though - some platforms get stuck without them.
+  if (jo.accurateSinglePrecision)
+  {
+    if (packed)
+    {
+      CVTPD2PS(output, input);
+      SetFPRFIfNeeded(R(output), true);  // Classify while the value is in single format.
+      CVTPS2PD(output, R(output));
+    }
+    else
+    {
+      CVTSD2SS(output, input);
+      SetFPRFIfNeeded(R(output), true);  // Classify while the value is in single format.
+      CVTSS2SD(output, R(output));
+      if (duplicate)
+        MOVDDUP(output, R(output));
+    }
+  }
+  else
+  {
+    if (!input.IsSimpleReg(output))
+    {
+      if (duplicate)
+        MOVDDUP(output, input);
+      else
+        MOVAPD(output, input);
+    }
+    // This path never narrows the value, so it is still in double format; classify it as such.
+    SetFPRFIfNeeded(input, false);
+  }
+}
+
+void Jit64::FinalizeDoubleResult(X64Reg output, const OpArg& input)
+{
+  if (!input.IsSimpleReg(output))  // Skip the copy when input already is the output register.
+    MOVSD(output, input);
+
+  SetFPRFIfNeeded(input, false);  // false: classify the value as a 64-bit double.
 }
 
 void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm_out, X64Reg xmm, X64Reg clobber)
@@ -210,8 +260,9 @@ void Jit64::fp_arith(UGeckoInstruction inst)
 
     HandleNaNs(inst, Rd, dest);
     if (single)
-      ForceSinglePrecision(Rd, Rd, packed, true);
-    SetFPRFIfNeeded(Rd);
+      FinalizeSingleResult(Rd, Rd, packed, true);
+    else
+      FinalizeDoubleResult(Rd, Rd);
   };
 
   switch (inst.SUBOP5)
@@ -452,14 +503,13 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
   if (single)
   {
     HandleNaNs(inst, result_reg, result_reg, result_reg == XMM1 ? XMM0 : XMM1);
-    ForceSinglePrecision(Rd, R(result_reg), packed, true);
+    FinalizeSingleResult(Rd, R(result_reg), packed, true);
   }
   else
   {
     HandleNaNs(inst, result_reg, result_reg, XMM1);
-    MOVSD(Rd, R(result_reg));
+    FinalizeDoubleResult(Rd, R(result_reg));
   }
-  SetFPRFIfNeeded(Rd);
 }
 
 void Jit64::fsign(UGeckoInstruction inst)
@@ -763,12 +813,11 @@ void Jit64::frspx(UGeckoInstruction inst)
   int d = inst.FD;
   bool packed = js.op->fprIsDuplicated[b] && !cpu_info.bAtom;
 
-  RCOpArg Rb = fpr.Use(b, RCMode::Read);
+  RCOpArg Rb = fpr.Bind(b, RCMode::Read);
   RCX64Reg Rd = fpr.Bind(d, RCMode::Write);
   RegCache::Realize(Rb, Rd);
 
-  ForceSinglePrecision(Rd, Rb, packed, true);
-  SetFPRFIfNeeded(Rd);
+  FinalizeSingleResult(Rd, Rb, packed, true);
 }
 
 void Jit64::frsqrtex(UGeckoInstruction inst)
@@ -786,8 +835,7 @@ void Jit64::frsqrtex(UGeckoInstruction inst)
 
   MOVAPD(XMM0, Rb);
   CALL(asm_routines.frsqrte);
-  MOVSD(Rd, XMM0);
-  SetFPRFIfNeeded(Rd);
+  FinalizeDoubleResult(Rd, R(XMM0));
 }
 
 void Jit64::fresx(UGeckoInstruction inst)
@@ -806,5 +854,5 @@ void Jit64::fresx(UGeckoInstruction inst)
   MOVAPD(XMM0, Rb);
   CALL(asm_routines.fres);
   MOVDDUP(Rd, R(XMM0));
-  SetFPRFIfNeeded(Rd);
+  SetFPRFIfNeeded(R(XMM0), true);
 }
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
index fa5a91bd8c..d07b9e6bc0 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
@@ -77,8 +77,7 @@ void Jit64::ps_sum(UGeckoInstruction inst)
     PanicAlertFmt("ps_sum WTF!!!");
   }
   HandleNaNs(inst, Rd, tmp, tmp == XMM1 ? XMM0 : XMM1);
-  ForceSinglePrecision(Rd, Rd);
-  SetFPRFIfNeeded(Rd);
+  FinalizeSingleResult(Rd, Rd);
 }
 
 void Jit64::ps_muls(UGeckoInstruction inst)
@@ -112,8 +111,7 @@ void Jit64::ps_muls(UGeckoInstruction inst)
     Force25BitPrecision(XMM1, R(XMM1), XMM0);
   MULPD(XMM1, Ra);
   HandleNaNs(inst, Rd, XMM1);
-  ForceSinglePrecision(Rd, Rd);
-  SetFPRFIfNeeded(Rd);
+  FinalizeSingleResult(Rd, Rd);
 }
 
 void Jit64::ps_mergeXX(UGeckoInstruction inst)
@@ -171,8 +169,7 @@ void Jit64::ps_rsqrte(UGeckoInstruction inst)
   CALL(asm_routines.frsqrte);
   MOVLHPS(Rd, XMM0);
 
-  ForceSinglePrecision(Rd, Rd);
-  SetFPRFIfNeeded(Rd);
+  FinalizeSingleResult(Rd, Rd);
 }
 
 void Jit64::ps_res(UGeckoInstruction inst)
@@ -196,8 +193,7 @@ void Jit64::ps_res(UGeckoInstruction inst)
   CALL(asm_routines.fres);
   MOVLHPS(Rd, XMM0);
 
-  ForceSinglePrecision(Rd, Rd);
-  SetFPRFIfNeeded(Rd);
+  FinalizeSingleResult(Rd, Rd);
 }
 
 void Jit64::ps_cmpXX(UGeckoInstruction inst)
diff --git a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp
index 409b158891..01a5115ba0 100644
--- a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp
+++ b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp
@@ -727,34 +727,6 @@ void EmuCodeBlock::JitClearCA()
   MOV(8, PPCSTATE(xer_ca), Imm8(0));
 }
 
-void EmuCodeBlock::ForceSinglePrecision(X64Reg output, const OpArg& input, bool packed,
-                                        bool duplicate)
-{
-  // Most games don't need these. Zelda requires it though - some platforms get stuck without them.
-  if (m_jit.jo.accurateSinglePrecision)
-  {
-    if (packed)
-    {
-      CVTPD2PS(output, input);
-      CVTPS2PD(output, R(output));
-    }
-    else
-    {
-      CVTSD2SS(output, input);
-      CVTSS2SD(output, R(output));
-      if (duplicate)
-        MOVDDUP(output, R(output));
-    }
-  }
-  else if (!input.IsSimpleReg(output))
-  {
-    if (duplicate)
-      MOVDDUP(output, input);
-    else
-      MOVAPD(output, input);
-  }
-}
-
 // Abstract between AVX and SSE: automatically handle 3-operand instructions
 void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&),
                           void (XEmitter::*sseOp)(X64Reg, const OpArg&), X64Reg regOp,
@@ -907,30 +879,35 @@ void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr
   MOVDDUP(dst, R(dst));
 }
 
-alignas(16) static const u64 psDoubleExp[2] = {0x7FF0000000000000ULL, 0};
-alignas(16) static const u64 psDoubleFrac[2] = {0x000FFFFFFFFFFFFFULL, 0};
-alignas(16) static const u64 psDoubleNoSign[2] = {0x7FFFFFFFFFFFFFFFULL, 0};
+alignas(16) static const u64 psDoubleExp[2] = {Common::DOUBLE_EXP, 0};
+alignas(16) static const u64 psDoubleFrac[2] = {Common::DOUBLE_FRAC, 0};
+alignas(16) static const u64 psDoubleNoSign[2] = {~Common::DOUBLE_SIGN, 0};
+
+alignas(16) static const u32 psFloatExp[4] = {Common::FLOAT_EXP, 0, 0, 0};
+alignas(16) static const u32 psFloatFrac[4] = {Common::FLOAT_FRAC, 0, 0, 0};
+alignas(16) static const u32 psFloatNoSign[4] = {~Common::FLOAT_SIGN, 0, 0, 0};
 
 // TODO: it might be faster to handle FPRF in the same way as CR is currently handled for integer,
-// storing
-// the result of each floating point op and calculating it when needed. This is trickier than for
-// integers
-// though, because there's 32 possible FPRF bit combinations but only 9 categories of floating point
-// values,
-// which makes the whole thing rather trickier.
-// Fortunately, PPCAnalyzer can optimize out a large portion of FPRF calculations, so maybe this
-// isn't
-// quite that necessary.
-void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm)
+// storing the result of each floating point op and calculating it when needed. This is trickier
+// than for integers though, because there's 32 possible FPRF bit combinations but only 9 categories
+// of floating point values. Fortunately, PPCAnalyzer can optimize out a large portion of FPRF
+// calculations, so maybe this isn't quite that necessary.
+void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm, bool single)
 {
+  const int input_size = single ? 32 : 64;
+
   AND(32, PPCSTATE(fpscr), Imm32(~FPRF_MASK));
 
   FixupBranch continue1, continue2, continue3, continue4;
   if (cpu_info.bSSE4_1)
   {
     MOVQ_xmm(R(RSCRATCH), xmm);
-    SHR(64, R(RSCRATCH), Imm8(63));  // Get the sign bit; almost all the branches need it.
-    PTEST(xmm, MConst(psDoubleExp));
+    // Get the sign bit; almost all the branches need it.
+    SHR(input_size, R(RSCRATCH), Imm8(input_size - 1));
+    if (single)
+      PTEST(xmm, MConst(psFloatExp));
+    else
+      PTEST(xmm, MConst(psDoubleExp));
     FixupBranch maxExponent = J_CC(CC_C);
     FixupBranch zeroExponent = J_CC(CC_Z);
 
@@ -940,7 +917,10 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm)
     continue1 = J();
 
     SetJumpTarget(maxExponent);
-    PTEST(xmm, MConst(psDoubleFrac));
+    if (single)
+      PTEST(xmm, MConst(psFloatFrac));
+    else
+      PTEST(xmm, MConst(psDoubleFrac));
     FixupBranch notNAN = J_CC(CC_Z);
 
     // Max exponent + mantissa: PPC_FPCLASS_QNAN
@@ -955,7 +935,10 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm)
     continue3 = J();
 
     SetJumpTarget(zeroExponent);
-    PTEST(xmm, MConst(psDoubleNoSign));
+    if (single)
+      PTEST(xmm, MConst(psFloatNoSign));
+    else
+      PTEST(xmm, MConst(psDoubleNoSign));
     FixupBranch zero = J_CC(CC_Z);
 
     // No exponent + mantissa: sign ? PPC_FPCLASS_ND : PPC_FPCLASS_PD;
@@ -971,37 +954,58 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm)
   else
   {
     MOVQ_xmm(R(RSCRATCH), xmm);
-    TEST(64, R(RSCRATCH), MConst(psDoubleExp));
+    if (single)
+      TEST(32, R(RSCRATCH), Imm32(Common::FLOAT_EXP));
+    else
+      TEST(64, R(RSCRATCH), MConst(psDoubleExp));
     FixupBranch zeroExponent = J_CC(CC_Z);
-    AND(64, R(RSCRATCH), MConst(psDoubleNoSign));
-    CMP(64, R(RSCRATCH), MConst(psDoubleExp));
+
+    if (single)
+    {
+      AND(32, R(RSCRATCH), Imm32(~Common::FLOAT_SIGN));
+      CMP(32, R(RSCRATCH), Imm32(Common::FLOAT_EXP));
+    }
+    else
+    {
+      AND(64, R(RSCRATCH), MConst(psDoubleNoSign));
+      CMP(64, R(RSCRATCH), MConst(psDoubleExp));
+    }
     FixupBranch nan =
         J_CC(CC_G);  // This works because if the sign bit is set, RSCRATCH is negative
     FixupBranch infinity = J_CC(CC_E);
+
     MOVQ_xmm(R(RSCRATCH), xmm);
-    SHR(64, R(RSCRATCH), Imm8(63));
+    SHR(input_size, R(RSCRATCH), Imm8(input_size - 1));
     LEA(32, RSCRATCH,
         MScaled(RSCRATCH, Common::PPC_FPCLASS_NN - Common::PPC_FPCLASS_PN, Common::PPC_FPCLASS_PN));
     continue1 = J();
+
     SetJumpTarget(nan);
     MOV(32, R(RSCRATCH), Imm32(Common::PPC_FPCLASS_QNAN));
     continue2 = J();
+
     SetJumpTarget(infinity);
     MOVQ_xmm(R(RSCRATCH), xmm);
-    SHR(64, R(RSCRATCH), Imm8(63));
+    SHR(input_size, R(RSCRATCH), Imm8(input_size - 1));
     LEA(32, RSCRATCH,
         MScaled(RSCRATCH, Common::PPC_FPCLASS_NINF - Common::PPC_FPCLASS_PINF,
                 Common::PPC_FPCLASS_PINF));
     continue3 = J();
+
     SetJumpTarget(zeroExponent);
-    TEST(64, R(RSCRATCH), MConst(psDoubleNoSign));
+    if (single)
+      TEST(input_size, R(RSCRATCH), Imm32(~Common::FLOAT_SIGN));
+    else
+      TEST(input_size, R(RSCRATCH), MConst(psDoubleNoSign));
     FixupBranch zero = J_CC(CC_Z);
-    SHR(64, R(RSCRATCH), Imm8(63));
+
+    SHR(input_size, R(RSCRATCH), Imm8(input_size - 1));
     LEA(32, RSCRATCH,
         MScaled(RSCRATCH, Common::PPC_FPCLASS_ND - Common::PPC_FPCLASS_PD, Common::PPC_FPCLASS_PD));
     continue4 = J();
+
     SetJumpTarget(zero);
-    SHR(64, R(RSCRATCH), Imm8(63));
+    SHR(input_size, R(RSCRATCH), Imm8(input_size - 1));
     SHL(32, R(RSCRATCH), Imm8(4));
     ADD(32, R(RSCRATCH), Imm8(Common::PPC_FPCLASS_PZ));
   }
diff --git a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h
index b8a1aae0c9..9f5c373df3 100644
--- a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h
+++ b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h
@@ -117,14 +117,12 @@ public:
               void (Gen::XEmitter::*sseOp)(Gen::X64Reg, const Gen::OpArg&, u8), Gen::X64Reg regOp,
               const Gen::OpArg& arg1, const Gen::OpArg& arg2, u8 imm);
 
-  void ForceSinglePrecision(Gen::X64Reg output, const Gen::OpArg& input, bool packed = true,
-                            bool duplicate = false);
   void Force25BitPrecision(Gen::X64Reg output, const Gen::OpArg& input, Gen::X64Reg tmp);
 
   // RSCRATCH might get trashed
   void ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false);
   void ConvertDoubleToSingle(Gen::X64Reg dst, Gen::X64Reg src);
-  void SetFPRF(Gen::X64Reg xmm);
+  void SetFPRF(Gen::X64Reg xmm, bool single);
   void Clear();
 
 protected: