From 3b1a4739bc02942fe4995ede8908bb625a4226f1 Mon Sep 17 00:00:00 2001
From: JosJuice <josjuice@gmail.com>
Date: Fri, 3 Oct 2025 20:32:27 +0200
Subject: [PATCH] JitArm64: Special-case fmadds with single-precision inputs

If all inputs to an fmadds instruction (including cousins like fmsubs
and fnmadds) are single-precision, then the result is identical between
a double-precision calculation with an error-free transformation (EFT),
whether that calculation is fused or not, and a single-precision FMA
instruction, which must be fused. So, as a performance optimization in
JitArm64, if we were going to use double precision with EFT but the
inputs are singles, we instead use a normal single-precision FMA
instruction without anything extra. This lets us skip both the EFT and
the double-to-single conversions.

Also rename `inaccurate_fma` to `nonfused`, because it's confusing that
`inaccurate_fma` and `m_accurate_fmadds` have such similar names despite
controlling separate things.
---
 .../JitArm64/JitArm64_FloatingPoint.cpp       | 37 +++++++------
 .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 54 ++++++++++---------
 2 files changed, 50 insertions(+), 41 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
index a2b3a0900f..f2ac356fc5 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@@ -82,7 +82,8 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
   const bool negate_b = op5 == 28 || op5 == 30;
 
   const bool output_is_single = inst.OPCD == 59;
-  const bool inaccurate_fma = fma && !Config::Get(Config::SESSION_USE_FMA);
+  const bool nonfused_requested = fma && !Config::Get(Config::SESSION_USE_FMA);
+  const bool error_free_transformation_requested = fma && m_accurate_fmadds;
   const bool round_c = use_c && output_is_single && !js.op->fprIsSingle[c];
 
   const auto inputs_are_singles_func = [&] {
@@ -90,14 +91,18 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
            (!use_c || fpr.IsSingle(c, true));
   };
 
-  const bool single = inputs_are_singles_func() && output_is_single && !inaccurate_fma;
+  const bool single = inputs_are_singles_func() && output_is_single &&
+                      (error_free_transformation_requested || !nonfused_requested);
   const RegType type = single ? RegType::LowerPairSingle : RegType::LowerPair;
   const RegType type_out = output_is_single ?
                                (single ? RegType::DuplicatedSingle : RegType::Duplicated) :
                                RegType::LowerPair;
   const auto reg_encoder = single ? EncodeRegToSingle : EncodeRegToDouble;
 
-  const bool error_free_transformation = fma && !single && output_is_single && m_accurate_fmadds;
+  const bool nonfused = nonfused_requested && !single;
+  const bool error_free_transformation =
+      error_free_transformation_requested && !single && output_is_single;
+
   if (error_free_transformation)
   {
     gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
@@ -120,13 +125,13 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
     }
 
     ARM64Reg result_reg = VD;
-    ARM64Reg inaccurate_fma_reg = VD;
+    ARM64Reg nonfused_reg = VD;
     if (error_free_transformation)
     {
       result_reg = reg_encoder(ARM64Reg::Q0);
-      inaccurate_fma_reg = reg_encoder(ARM64Reg::Q0);
+      nonfused_reg = reg_encoder(ARM64Reg::Q0);
 
-      if (inaccurate_fma && V0Q == ARM64Reg::INVALID_REG)
+      if (nonfused && V0Q == ARM64Reg::INVALID_REG)
         V0Q = fpr.GetScopedReg();
     }
     else
@@ -138,13 +143,13 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
         if (V0Q == ARM64Reg::INVALID_REG)
           V0Q = fpr.GetScopedReg();
         result_reg = reg_encoder(V0Q);
-        inaccurate_fma_reg = reg_encoder(V0Q);
+        nonfused_reg = reg_encoder(V0Q);
       }
-      else if (fma && inaccurate_fma && VD == VB)
+      else if (fma && nonfused && VD == VB)
       {
         if (V0Q == ARM64Reg::INVALID_REG)
           V0Q = fpr.GetScopedReg();
-        inaccurate_fma_reg = reg_encoder(V0Q);
+        nonfused_reg = reg_encoder(V0Q);
       }
     }
 
@@ -174,10 +179,10 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
     // So, we negate using a separate FNEG instruction instead of using AArch64's nmadd/msub.
     case 28:  // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm"
     case 30:  // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)"
-      if (inaccurate_fma)
+      if (nonfused)
       {
-        m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg);
-        m_float_emit.FSUB(result_reg, inaccurate_fma_reg, VB);
+        m_float_emit.FMUL(nonfused_reg, VA, rounded_c_reg);
+        m_float_emit.FSUB(result_reg, nonfused_reg, VB);
       }
       else
       {
@@ -186,10 +191,10 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
       break;
     case 29:  // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm"
     case 31:  // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)"
-      if (inaccurate_fma)
+      if (nonfused)
       {
-        m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg);
-        m_float_emit.FADD(result_reg, inaccurate_fma_reg, VB);
+        m_float_emit.FMUL(nonfused_reg, VA, rounded_c_reg);
+        m_float_emit.FADD(result_reg, nonfused_reg, VB);
       }
       else
       {
@@ -269,7 +274,7 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
       m_float_emit.FSUB(ARM64Reg::D2, result_reg, ARM64Reg::D1);
 
       // da := a - a'
-      if (inaccurate_fma)
+      if (nonfused)
       {
         m_float_emit.FMUL(EncodeRegToDouble(V0Q), VA, rounded_c_reg);
         m_float_emit.FSUB(ARM64Reg::D1, EncodeRegToDouble(V0Q), ARM64Reg::D1);
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
index 84ef4ef182..88ba86c2af 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
@@ -94,19 +94,23 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
   const bool negate_result = (op5 & ~0x1) == 30;
   const bool negate_b = op5 == 28 || op5 == 30;
 
-  const bool inaccurate_fma = fma && !Config::Get(Config::SESSION_USE_FMA);
+  const bool nonfused_requested = fma && !Config::Get(Config::SESSION_USE_FMA);
+  const bool error_free_transformation_requested = fma && m_accurate_fmadds;
   const bool round_c = use_c && !js.op->fprIsSingle[c];
 
   const auto inputs_are_singles_func = [&] {
     return fpr.IsSingle(a) && (!use_b || fpr.IsSingle(b)) && (!use_c || fpr.IsSingle(c));
   };
 
-  const bool single = inputs_are_singles_func() && !inaccurate_fma;
+  const bool single =
+      inputs_are_singles_func() && (error_free_transformation_requested || !nonfused_requested);
   const RegType type = single ? RegType::Single : RegType::Register;
   const u8 size = single ? 32 : 64;
   const auto reg_encoder = single ? EncodeRegToDouble : EncodeRegToQuad;
 
-  const bool error_free_transformation = fma && !single && m_accurate_fmadds;
+  const bool nonfused = nonfused_requested && !single;
+  const bool error_free_transformation = error_free_transformation_requested && !single;
+
   if (error_free_transformation)
   {
     gpr.Lock(ARM64Reg::W0, ARM64Reg::W30);
@@ -139,36 +143,36 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
     }
 
     ARM64Reg result_reg = VD;
-    ARM64Reg inaccurate_fma_reg = VD;
+    ARM64Reg nonfused_reg = VD;
     if (error_free_transformation)
     {
       result_reg = reg_encoder(ARM64Reg::Q0);
-      inaccurate_fma_reg = reg_encoder(ARM64Reg::Q0);
+      nonfused_reg = reg_encoder(ARM64Reg::Q0);
     }
     else
     {
-      const bool need_accurate_fma_reg =
-          fma && !inaccurate_fma && (negate_b || VD != VB) && (VD == VA || VD == rounded_c_reg);
+      const bool need_fused_fma_reg =
+          fma && !nonfused && (negate_b || VD != VB) && (VD == VA || VD == rounded_c_reg);
       const bool preserve_d =
           m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
-      if (need_accurate_fma_reg || preserve_d)
+      if (need_fused_fma_reg || preserve_d)
       {
         if (V0Q == ARM64Reg::INVALID_REG)
           V0Q = fpr.GetScopedReg();
         result_reg = reg_encoder(V0Q);
-        inaccurate_fma_reg = reg_encoder(V0Q);
+        nonfused_reg = reg_encoder(V0Q);
 
-        if (need_accurate_fma_reg && round_c)
+        if (need_fused_fma_reg && round_c)
         {
           V1Q = fpr.GetScopedReg();
           rounded_c_reg = reg_encoder(V1Q);
         }
       }
-      else if (fma && inaccurate_fma && VD == VB)
+      else if (fma && nonfused && VD == VB)
       {
         if (V0Q == ARM64Reg::INVALID_REG)
           V0Q = fpr.GetScopedReg();
-        inaccurate_fma_reg = reg_encoder(V0Q);
+        nonfused_reg = reg_encoder(V0Q);
       }
     }
 
@@ -206,10 +210,10 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
       m_float_emit.FMUL(size, result_reg, VA, rounded_c_reg, 1);
       break;
     case 14:  // ps_madds0: d = a * c.ps0 + b
-      if (inaccurate_fma)
+      if (nonfused)
       {
-        m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 0);
-        m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
+        m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg, 0);
+        m_float_emit.FADD(size, result_reg, nonfused_reg, VB);
       }
       else
       {
@@ -219,10 +223,10 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
       }
       break;
     case 15:  // ps_madds1: d = a * c.ps1 + b
-      if (inaccurate_fma)
+      if (nonfused)
       {
-        m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 1);
-        m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
+        m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg, 1);
+        m_float_emit.FADD(size, result_reg, nonfused_reg, VB);
       }
       else
       {
@@ -245,10 +249,10 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
       break;
     case 28:  // ps_msub:  d = a * c - b
     case 30:  // ps_nmsub: d = -(a * c - b)
-      if (inaccurate_fma)
+      if (nonfused)
       {
-        m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg);
-        m_float_emit.FSUB(size, result_reg, inaccurate_fma_reg, VB);
+        m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg);
+        m_float_emit.FSUB(size, result_reg, nonfused_reg, VB);
       }
       else
       {
@@ -263,10 +267,10 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
       break;
     case 29:  // ps_madd:  d = a * c + b
     case 31:  // ps_nmadd: d = -(a * c + b)
-      if (inaccurate_fma)
+      if (nonfused)
       {
-        m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg);
-        m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
+        m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg);
+        m_float_emit.FADD(size, result_reg, nonfused_reg, VB);
       }
       else
       {
@@ -307,7 +311,7 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
 
       // da := a - a'
       // (Transformed into da := a + -a')
-      if (inaccurate_fma)
+      if (nonfused)
       {
         switch (op5)
         {