Arm64: Implements support for DAZ using AFP.FIZ

When AFP is supported, we can actually support DAZ. This might also fix the audio corruption in Animal Well, but I can't test it until Steam is running on Oryon. This requires a bit of plumbing for MXCSR, which we were hacking around before; now we actually want to store the value. Fixes #3856
2025-01-22 06:20:58 +00:00 · 2024-07-20 15:29:04 -07:00 · 2024-07-20 15:29:04 -07:00 · b78da2e5ad
commit b78da2e5ad
parent 54fc8cb0bd
7 changed files with 51 additions and 24 deletions
--- a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp
+++ b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp
@ -575,7 +575,7 @@ void Arm64Emitter::SpillStaticRegs(ARMEmitter::Register TmpReg, bool FPRs, uint3
  if (EmitterCTX->HostFeatures.SupportsAFP) {
    // Disable AFP features when spilling registers.
    //
-    // Disable FPCR.NEP and FPCR.AH
+    // Disable FPCR.NEP, FPCR.AH, and FPCR.FIZ
    // NEP(2): Changes ASIMD scalar instructions to insert in to the lower bits of the destination.
    // AH(1):  Changes NaN behaviour in some instructions. Specifically fmin, fmax.
    //         Also interacts with RPRES to change reciprocal/rsqrt precision from 8-bit mantissa to 12-bit.
@ -585,7 +585,8 @@ void Arm64Emitter::SpillStaticRegs(ARMEmitter::Register TmpReg, bool FPRs, uint3
    mrs(TmpReg, ARMEmitter::SystemRegister::FPCR);
    bic(ARMEmitter::Size::i64Bit, TmpReg, TmpReg,
        (1U << 2) |   // NEP
-          (1U << 1)); // AH
+          (1U << 1) | // AH
+          (1U << 0)); // FIZ
    msr(ARMEmitter::SystemRegister::FPCR, TmpReg);
  }
 #endif
@ -664,18 +665,24 @@ void Arm64Emitter::SpillStaticRegs(ARMEmitter::Register TmpReg, bool FPRs, uint3
 }

 void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRFillMask) {
-  ARMEmitter::Register TmpReg = ARMEmitter::Reg::r0;
-  LOGMAN_THROW_A_FMT(GPRFillMask != 0, "Must fill at least 1 GPR for a temp");
-  [[maybe_unused]] bool FoundRegister {};
-  for (auto Reg : StaticRegisters) {
-    if (((1U << Reg.Idx()) & GPRFillMask)) {
-      TmpReg = Reg;
-      FoundRegister = true;
-      break;
+  auto FindTempReg = [this](uint32_t* GPRFillMask) -> std::optional<ARMEmitter::Register> {
+    for (auto Reg : StaticRegisters) {
+      if (((1U << Reg.Idx()) & *GPRFillMask)) {
+        *GPRFillMask &= ~(1U << Reg.Idx());
+        return std::make_optional(Reg);
+      }
    }
-  }
+    return std::nullopt;
+  };

-  LOGMAN_THROW_A_FMT(FoundRegister, "Didn't have an SRA register to use as a temporary while spilling!");
+  LOGMAN_THROW_A_FMT(GPRFillMask != 0, "Must fill at least 2 GPRs for a temp");
+  uint32_t TempGPRFillMask = GPRFillMask;
+  auto Reg = FindTempReg(&TempGPRFillMask);
+  auto Reg2 = FindTempReg(&TempGPRFillMask);
+  LOGMAN_THROW_A_FMT(Reg.has_value() && Reg2.has_value(), "Didn't have an SRA register to use as a temporary while spilling!");
+
+  auto TmpReg = *Reg;
+  [[maybe_unused]] auto TmpReg2 = *Reg2;

 #ifndef VIXL_SIMULATOR
  if (EmitterCTX->HostFeatures.SupportsAFP) {
@ -692,6 +699,11 @@ void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRF
    orr(ARMEmitter::Size::i64Bit, TmpReg, TmpReg,
        (1U << 2) |   // NEP
          (1U << 1)); // AH
+
+    // Insert MXCSR.DAZ in to FIZ
+    ldr(TmpReg2.W(), STATE.R(), offsetof(FEXCore::Core::CPUState, mxcsr));
+    bfxil(ARMEmitter::Size::i64Bit, TmpReg, TmpReg2, 6, 1);
+
    msr(ARMEmitter::SystemRegister::FPCR, TmpReg);
  }
 #endif
--- a/FEXCore/Source/Interface/Core/JIT/Arm64/MiscOps.cpp
+++ b/FEXCore/Source/Interface/Core/JIT/Arm64/MiscOps.cpp
@ -98,6 +98,7 @@ DEF_OP(GetRoundingMode) {
 DEF_OP(SetRoundingMode) {
  auto Op = IROp->C<IR::IROp_SetRoundingMode>();
  auto Src = GetReg(Op->RoundMode.ID());
+  auto MXCSR = GetReg(Op->MXCSR.ID());

  // As above, setup the rounding flags in [31:30]
  rbit(ARMEmitter::Size::i32Bit, TMP2, Src);
@ -116,6 +117,11 @@ DEF_OP(SetRoundingMode) {
  lsr(ARMEmitter::Size::i64Bit, TMP2, Src, 2);
  bfi(ARMEmitter::Size::i64Bit, TMP1, TMP2, 24, 1);

+  if (Op->SetDAZ && HostSupportsAFP) {
+    // Extract DAZ from MXCSR and insert it in to FPCR.FIZ
+    bfxil(ARMEmitter::Size::i64Bit, TMP1, MXCSR, 6, 1);
+  }
+
  // Now save the new FPCR
  msr(ARMEmitter::SystemRegister::FPCR, TMP1);
 }
--- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp
+++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp
@ -2599,10 +2599,11 @@ void OpDispatchBuilder::SaveAVXState(Ref MemBase) {
 }

 Ref OpDispatchBuilder::GetMXCSR() {
-  // Default MXCSR Value
-  Ref MXCSR = _Constant(0x1F80);
-  Ref RoundingMode = _GetRoundingMode();
-  return _Bfi(OpSize::i32Bit, 3, 13, MXCSR, RoundingMode);
+  Ref MXCSR = _LoadContext(OpSize::i32Bit, GPRClass, offsetof(FEXCore::Core::CPUState, mxcsr));
+  // Mask out unsupported bits
+  // Keeps FZ, RC, exception masks, and DAZ
+  MXCSR = _And(OpSize::i32Bit, MXCSR, _Constant(0xFFC0));
+  return MXCSR;
 }

 void OpDispatchBuilder::FXRStoreOp(OpcodeArgs) {
@ -2711,9 +2712,13 @@ void OpDispatchBuilder::RestoreSSEState(Ref MemBase) {
 }

 void OpDispatchBuilder::RestoreMXCSRState(Ref MXCSR) {
+  // Mask out unsupported bits
+  MXCSR = _And(OpSize::i32Bit, MXCSR, _Constant(0xFFC0));
+
+  _StoreContext(OpSize::i32Bit, GPRClass, MXCSR, offsetof(FEXCore::Core::CPUState, mxcsr));
  // We only support the rounding mode and FTZ bit being set
  Ref RoundingMode = _Bfe(OpSize::i32Bit, 3, 13, MXCSR);
-  _SetRoundingMode(RoundingMode);
+  _SetRoundingMode(RoundingMode, true, MXCSR);
 }

 void OpDispatchBuilder::RestoreAVXState(Ref MemBase) {
--- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp
+++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp
@ -48,7 +48,7 @@ void OpDispatchBuilder::FNINITF64(OpcodeArgs) {
  auto NewFCW = _Constant(16, 0x037F);
  // Init host rounding mode to zero
  auto Zero = _Constant(0);
-  _SetRoundingMode(Zero);
+  _SetRoundingMode(Zero, false, Zero);
  _StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));

  // Init FSW to 0
@ -71,7 +71,7 @@ void OpDispatchBuilder::X87LDENVF64(OpcodeArgs) {
  // ignore the rounding precision, we're always 64-bit in F64.
  // extract rounding mode
  Ref roundingMode = _Bfe(OpSize::i32Bit, 3, 10, NewFCW);
-  _SetRoundingMode(roundingMode);
+  _SetRoundingMode(roundingMode, false, roundingMode);
  _StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));

  auto NewFSW = _LoadMem(GPRClass, Size, Mem, _Constant(Size * 1), Size, MEM_OFFSET_SXTX, 1);
@ -89,7 +89,7 @@ void OpDispatchBuilder::X87FLDCWF64(OpcodeArgs) {
  // ignore the rounding precision, we're always 64-bit in F64.
  // extract rounding mode
  Ref roundingMode = _Bfe(OpSize::i32Bit, 3, 10, NewFCW);
-  _SetRoundingMode(roundingMode);
+  _SetRoundingMode(roundingMode, false, roundingMode);
  _StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));
 }

@ -783,7 +783,7 @@ void OpDispatchBuilder::X87FRSTORF64(OpcodeArgs) {
  auto roundMask = _Constant(3);
  roundingMode = _Lshr(OpSize::i32Bit, roundingMode, roundShift);
  roundingMode = _And(OpSize::i32Bit, roundingMode, roundMask);
-  _SetRoundingMode(roundingMode);
+  _SetRoundingMode(roundingMode, false, roundingMode);
  _StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));
  _StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));

--- a/FEXCore/Source/Interface/IR/IR.json
+++ b/FEXCore/Source/Interface/IR/IR.json
@ -226,7 +226,7 @@
        "DestSize": "4"
      },

-      "SetRoundingMode GPR:$RoundMode": {
+      "SetRoundingMode GPR:$RoundMode, i1:$SetDAZ, GPR:$MXCSR": {
        "Desc": ["Sets the current rounding mode options for the thread"
                ],
        "HasSideEffects": true
--- a/FEXCore/include/FEXCore/Core/CoreState.h
+++ b/FEXCore/include/FEXCore/Core/CoreState.h
@ -104,7 +104,7 @@ struct CPUState {
  // Raw segment register indexes
  uint16_t es_idx {}, cs_idx {}, ss_idx {}, ds_idx {};
  uint16_t gs_idx {}, fs_idx {};
-  uint16_t _pad2[2];
+  uint32_t mxcsr {};

  // Segment registers holding base addresses
  uint32_t es_cached {}, cs_cached {}, ss_cached {}, ds_cached {};
@ -162,6 +162,10 @@ struct CPUState {
    // we encode DF as 1/-1 within the JIT, so we have to write 0x1 here to
    // zero DF.
    flags[X86State::RFLAG_DF_RAW_LOC] = 0x1;
+
+    // Default mxcsr value
+    // All exception masks enabled.
+    mxcsr = 0x1F80;
  }
 };
 static_assert(std::is_trivially_copyable_v<CPUState>, "Needs to be trivial");
--- a/unittests/ASM/VEX/vldmxcsr.asm
+++ b/unittests/ASM/VEX/vldmxcsr.asm
@ -2,7 +2,7 @@
 {
  "HostFeatures": ["AVX"],
  "RegData": {
-    "RAX": "0xFF80"
+    "RAX": "0xFFC0"
  },
  "MemoryRegions": {
    "0x100000000": "4096"