From 3b1a4739bc02942fe4995ede8908bb625a4226f1 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Fri, 3 Oct 2025 20:32:27 +0200 Subject: [PATCH] JitArm64: Special-case fmadds with single-precision inputs If all inputs to an fmadds instruction (including cousins like fmsubs, fnmadd...) are single-precision, then the result is identical between a double-precision calculation with an error-free transform (whether the calculation is fused or not) and a single-precision FMA instruction (must be fused). So as a performance optimization in JitArm64, if we were going to use double precision with EFT but the inputs are singles, instead we'll use a normal single-precision FMA instruction without anything extra. This lets us skip both the EFT and double-to-single conversions. Also renaming `inaccurate_fma` to `nonfused` because it's confusing that `inaccurate_fma` and `m_accurate_fmadds` have such similar names despite controlling separate things. --- .../JitArm64/JitArm64_FloatingPoint.cpp | 37 +++++++------ .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 54 ++++++++++--------- 2 files changed, 50 insertions(+), 41 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index a2b3a0900f..f2ac356fc5 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -82,7 +82,8 @@ void JitArm64::fp_arith(UGeckoInstruction inst) const bool negate_b = op5 == 28 || op5 == 30; const bool output_is_single = inst.OPCD == 59; - const bool inaccurate_fma = fma && !Config::Get(Config::SESSION_USE_FMA); + const bool nonfused_requested = fma && !Config::Get(Config::SESSION_USE_FMA); + const bool error_free_transformation_requested = fma && m_accurate_fmadds; const bool round_c = use_c && output_is_single && !js.op->fprIsSingle[c]; const auto inputs_are_singles_func = [&] { @@ -90,14 +91,18 @@ void JitArm64::fp_arith(UGeckoInstruction inst) (!use_c || fpr.IsSingle(c, true)); }; - const bool single = inputs_are_singles_func() && output_is_single && !inaccurate_fma; + const bool single = inputs_are_singles_func() && output_is_single && + (error_free_transformation_requested || !nonfused_requested); const RegType type = single ? RegType::LowerPairSingle : RegType::LowerPair; const RegType type_out = output_is_single ? (single ? RegType::DuplicatedSingle : RegType::Duplicated) : RegType::LowerPair; const auto reg_encoder = single ? EncodeRegToSingle : EncodeRegToDouble; - const bool error_free_transformation = fma && !single && output_is_single && m_accurate_fmadds; + const bool nonfused = nonfused_requested && !single; + const bool error_free_transformation = + error_free_transformation_requested && !single && output_is_single; + if (error_free_transformation) { gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30); @@ -120,13 +125,13 @@ void JitArm64::fp_arith(UGeckoInstruction inst) } ARM64Reg result_reg = VD; - ARM64Reg inaccurate_fma_reg = VD; + ARM64Reg nonfused_reg = VD; if (error_free_transformation) { result_reg = reg_encoder(ARM64Reg::Q0); - inaccurate_fma_reg = reg_encoder(ARM64Reg::Q0); + nonfused_reg = reg_encoder(ARM64Reg::Q0); - if (inaccurate_fma && V0Q == ARM64Reg::INVALID_REG) + if (nonfused && V0Q == ARM64Reg::INVALID_REG) V0Q = fpr.GetScopedReg(); } else @@ -138,13 +143,13 @@ void JitArm64::fp_arith(UGeckoInstruction inst) if (V0Q == ARM64Reg::INVALID_REG) V0Q = fpr.GetScopedReg(); result_reg = reg_encoder(V0Q); - inaccurate_fma_reg = reg_encoder(V0Q); + nonfused_reg = reg_encoder(V0Q); } - else if (fma && inaccurate_fma && VD == VB) + else if (fma && nonfused && VD == VB) { if (V0Q == ARM64Reg::INVALID_REG) V0Q = fpr.GetScopedReg(); - inaccurate_fma_reg = reg_encoder(V0Q); + nonfused_reg = reg_encoder(V0Q); } } @@ -174,10 +179,10 @@ void JitArm64::fp_arith(UGeckoInstruction inst) // So, we negate using a separate FNEG instruction instead of using AArch64's nmadd/msub. case 28: // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm" case 30: // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)" - if (inaccurate_fma) + if (nonfused) { - m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg); - m_float_emit.FSUB(result_reg, inaccurate_fma_reg, VB); + m_float_emit.FMUL(nonfused_reg, VA, rounded_c_reg); + m_float_emit.FSUB(result_reg, nonfused_reg, VB); } else { @@ -186,10 +191,10 @@ void JitArm64::fp_arith(UGeckoInstruction inst) break; case 29: // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm" case 31: // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)" - if (inaccurate_fma) + if (nonfused) { - m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg); - m_float_emit.FADD(result_reg, inaccurate_fma_reg, VB); + m_float_emit.FMUL(nonfused_reg, VA, rounded_c_reg); + m_float_emit.FADD(result_reg, nonfused_reg, VB); } else { @@ -269,7 +274,7 @@ void JitArm64::fp_arith(UGeckoInstruction inst) m_float_emit.FSUB(ARM64Reg::D2, result_reg, ARM64Reg::D1); // da := a - a' - if (inaccurate_fma) + if (nonfused) { m_float_emit.FMUL(EncodeRegToDouble(V0Q), VA, rounded_c_reg); m_float_emit.FSUB(ARM64Reg::D1, EncodeRegToDouble(V0Q), ARM64Reg::D1); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp index 84ef4ef182..88ba86c2af 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp @@ -94,19 +94,23 @@ void JitArm64::ps_arith(UGeckoInstruction inst) const bool negate_result = (op5 & ~0x1) == 30; const bool negate_b = op5 == 28 || op5 == 30; - const bool inaccurate_fma = fma && !Config::Get(Config::SESSION_USE_FMA); + const bool nonfused_requested = fma && !Config::Get(Config::SESSION_USE_FMA); + const bool error_free_transformation_requested = fma && m_accurate_fmadds; const bool round_c = use_c && !js.op->fprIsSingle[c]; const auto inputs_are_singles_func = [&] { return fpr.IsSingle(a) && (!use_b || fpr.IsSingle(b)) && (!use_c || fpr.IsSingle(c)); }; - const bool single = inputs_are_singles_func() && !inaccurate_fma; + const bool single = + inputs_are_singles_func() && (error_free_transformation_requested || !nonfused_requested); const RegType type = single ? RegType::Single : RegType::Register; const u8 size = single ? 32 : 64; const auto reg_encoder = single ? EncodeRegToDouble : EncodeRegToQuad; - const bool error_free_transformation = fma && !single && m_accurate_fmadds; + const bool nonfused = nonfused_requested && !single; + const bool error_free_transformation = error_free_transformation_requested && !single; + if (error_free_transformation) { gpr.Lock(ARM64Reg::W0, ARM64Reg::W30); @@ -139,36 +143,36 @@ void JitArm64::ps_arith(UGeckoInstruction inst) } ARM64Reg result_reg = VD; - ARM64Reg inaccurate_fma_reg = VD; + ARM64Reg nonfused_reg = VD; if (error_free_transformation) { result_reg = reg_encoder(ARM64Reg::Q0); - inaccurate_fma_reg = reg_encoder(ARM64Reg::Q0); + nonfused_reg = reg_encoder(ARM64Reg::Q0); } else { - const bool need_accurate_fma_reg = - fma && !inaccurate_fma && (negate_b || VD != VB) && (VD == VA || VD == rounded_c_reg); + const bool need_fused_fma_reg = + fma && !nonfused && (negate_b || VD != VB) && (VD == VA || VD == rounded_c_reg); const bool preserve_d = m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC)); - if (need_accurate_fma_reg || preserve_d) + if (need_fused_fma_reg || preserve_d) { if (V0Q == ARM64Reg::INVALID_REG) V0Q = fpr.GetScopedReg(); result_reg = reg_encoder(V0Q); - inaccurate_fma_reg = reg_encoder(V0Q); + nonfused_reg = reg_encoder(V0Q); - if (need_accurate_fma_reg && round_c) + if (need_fused_fma_reg && round_c) { V1Q = fpr.GetScopedReg(); rounded_c_reg = reg_encoder(V1Q); } } - else if (fma && inaccurate_fma && VD == VB) + else if (fma && nonfused && VD == VB) { if (V0Q == ARM64Reg::INVALID_REG) V0Q = fpr.GetScopedReg(); - inaccurate_fma_reg = reg_encoder(V0Q); + nonfused_reg = reg_encoder(V0Q); } } @@ -206,10 +210,10 @@ void JitArm64::ps_arith(UGeckoInstruction inst) m_float_emit.FMUL(size, result_reg, VA, rounded_c_reg, 1); break; case 14: // ps_madds0: d = a * c.ps0 + b - if (inaccurate_fma) + if (nonfused) { - m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 0); - m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB); + m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg, 0); + m_float_emit.FADD(size, result_reg, nonfused_reg, VB); } else { @@ -219,10 +223,10 @@ void JitArm64::ps_arith(UGeckoInstruction inst) } break; case 15: // ps_madds1: d = a * c.ps1 + b - if (inaccurate_fma) + if (nonfused) { - m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 1); - m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB); + m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg, 1); + m_float_emit.FADD(size, result_reg, nonfused_reg, VB); } else { @@ -245,10 +249,10 @@ void JitArm64::ps_arith(UGeckoInstruction inst) break; case 28: // ps_msub: d = a * c - b case 30: // ps_nmsub: d = -(a * c - b) - if (inaccurate_fma) + if (nonfused) { - m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg); - m_float_emit.FSUB(size, result_reg, inaccurate_fma_reg, VB); + m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg); + m_float_emit.FSUB(size, result_reg, nonfused_reg, VB); } else { @@ -263,10 +267,10 @@ void JitArm64::ps_arith(UGeckoInstruction inst) break; case 29: // ps_madd: d = a * c + b case 31: // ps_nmadd: d = -(a * c + b) - if (inaccurate_fma) + if (nonfused) { - m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg); - m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB); + m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg); + m_float_emit.FADD(size, result_reg, nonfused_reg, VB); } else { @@ -307,7 +311,7 @@ void JitArm64::ps_arith(UGeckoInstruction inst) // da := a - a' // (Transformed into da := a + -a') - if (inaccurate_fma) + if (nonfused) { switch (op5) {