diff --git a/Source/Core/Common/ArmCPUDetect.cpp b/Source/Core/Common/ArmCPUDetect.cpp index 4100df9f82..a9bacf6f51 100644 --- a/Source/Core/Common/ArmCPUDetect.cpp +++ b/Source/Core/Common/ArmCPUDetect.cpp @@ -63,13 +63,13 @@ CPUInfo::CPUInfo() void CPUInfo::Detect() { // Set some defaults here - // When ARMv8 CPUs come out, these need to be updated. HTT = false; OS64bit = true; CPU64bit = true; Mode64bit = true; vendor = CPUVendor::ARM; bFlushToZero = true; + bAFP = false; #ifdef __APPLE__ num_cores = std::thread::hardware_concurrency(); diff --git a/Source/Core/Common/ArmFPURoundMode.cpp b/Source/Core/Common/ArmFPURoundMode.cpp index 323e456ae7..94ed16d279 100644 --- a/Source/Core/Common/ArmFPURoundMode.cpp +++ b/Source/Core/Common/ArmFPURoundMode.cpp @@ -2,8 +2,10 @@ // Licensed under GPLv2+ // Refer to the license.txt file included. +#include "Common/CPUDetect.h" #include "Common/CommonTypes.h" #include "Common/FPURoundMode.h" +#include "Common/Logging/Log.h" #ifdef _MSC_VER #include @@ -45,8 +47,25 @@ void SetPrecisionMode(PrecisionMode mode) void SetSIMDMode(int rounding_mode, bool non_ieee_mode) { - // Flush-To-Zero (non-IEEE mode: denormal outputs are set to +/- 0) + // When AH is disabled, FZ controls flush-to-zero for both inputs and outputs. When AH is enabled, + // FZ controls flush-to-zero for outputs, and FIZ controls flush-to-zero for inputs. constexpr u32 FZ = 1 << 24; + constexpr u32 AH = 1 << 1; + constexpr u32 FIZ = 1 << 0; + constexpr u32 flush_to_zero_mask = FZ | AH | FIZ; + + // On CPUs with FEAT_AFP support, setting AH = 1, FZ = 1, FIZ = 0 emulates the GC/Wii CPU's + // "non-IEEE mode". Unfortunately, FEAT_AFP didn't exist until 2020, so we can't count on setting + // AH actually doing anything. But flushing both inputs and outputs seems to cause fewer problems + // than flushing nothing, so let's just set FZ and AH and roll with whatever behavior we get. + const u32 flush_to_zero_bits = (non_ieee_mode ? 
FZ | AH : 0); + static bool afp_warning_shown = false; + if (!afp_warning_shown && !cpu_info.bAFP && non_ieee_mode) + { + afp_warning_shown = true; + WARN_LOG_FMT(POWERPC, + "Non-IEEE mode was requested, but host CPU is not known to support FEAT_AFP"); + } // lookup table for FPSCR.RN-to-FPCR.RMode translation constexpr u32 rounding_mode_table[] = { @@ -55,9 +74,11 @@ void SetSIMDMode(int rounding_mode, bool non_ieee_mode) (1 << 22), // +inf (2 << 22), // -inf }; + constexpr u32 rounding_mode_mask = 3 << 22; + const u32 rounding_mode_bits = rounding_mode_table[rounding_mode]; - const u64 base = default_fpcr & ~(0b111 << 22); - SetFPCR(base | rounding_mode_table[rounding_mode] | (non_ieee_mode ? FZ : 0)); + const u64 base = default_fpcr & ~(flush_to_zero_mask | rounding_mode_mask); + SetFPCR(base | rounding_mode_bits | flush_to_zero_bits); } void SaveSIMDState() diff --git a/Source/Core/Common/CPUDetect.h b/Source/Core/Common/CPUDetect.h index 6157e7764e..c58033a807 100644 --- a/Source/Core/Common/CPUDetect.h +++ b/Source/Core/Common/CPUDetect.h @@ -64,6 +64,7 @@ struct CPUInfo bool bCRC32 = false; bool bSHA1 = false; bool bSHA2 = false; + bool bAFP = false; // Alternate floating-point behavior // Call Detect() explicit CPUInfo(); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index bb975f138f..b5f9e498aa 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -3,6 +3,7 @@ // Refer to the license.txt file included. 
#include "Common/Arm64Emitter.h" +#include "Common/CPUDetect.h" #include "Common/CommonTypes.h" #include "Common/StringUtil.h" @@ -374,7 +375,19 @@ void JitArm64::fcmpX(UGeckoInstruction inst) const u32 b = inst.FB; const int crf = inst.CRFD; - const bool singles = fpr.IsSingle(a, true) && fpr.IsSingle(b, true); + // On the GC/Wii CPU, outputs are flushed to zero if FPSCR.NI is set, and inputs are never + // flushed to zero. Ideally we would emulate FPSCR.NI by setting FPCR.FZ and FPCR.AH, but + // unfortunately FPCR.AH is a very new feature that we can't rely on (as of 2021). For CPUs + // without FPCR.AH, the best we can do (without killing the performance by explicitly flushing + // outputs using bitwise operations) is to only set FPCR.FZ, which flushes both inputs and + // outputs. This may cause problems in some cases, and one such case is Pokémon Battle Revolution, + // which does not progress past the title screen if a denormal single compares equal to zero. + // Workaround: Perform the comparison using a double operation instead. This ensures that denormal + // singles behave correctly in comparisons, but we still have a problem with denormal doubles. + const bool input_ftz_workaround = + !cpu_info.bAFP && (!js.fpr_is_store_safe[a] || !js.fpr_is_store_safe[b]); + + const bool singles = fpr.IsSingle(a, true) && fpr.IsSingle(b, true) && !input_ftz_workaround; const RegType type = singles ? RegType::LowerPairSingle : RegType::LowerPair; const auto reg_encoder = singles ? EncodeRegToSingle : EncodeRegToDouble;