Merge pull request #3927 from bylaws/winafp

ARM64EC: Set appropriate AFP and SVE256 state on JIT entry/exit
This commit is contained in:
Ryan Houdek 2024-08-08 22:21:23 -07:00 committed by GitHub
commit 85d1b573ef
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 51 additions and 40 deletions

View File

@ -570,6 +570,47 @@ void Arm64Emitter::PopCalleeSavedRegisters() {
}
}
// Emits instructions that (re)initialize special host register state when
// entering JIT code: the AFP bits in FPCR and, optionally, the fixed SVE
// predicate temporaries.
//
// TmpReg/TmpReg2: scratch GPRs this routine may clobber while building FPCR.
// SetFIZ:         when true, also mirror the guest MXCSR.DAZ flag into FPCR.FIZ.
// SetPredRegs:    when true, re-materialize PRED_TMP_16B/PRED_TMP_32B via ptrue.
void Arm64Emitter::FillSpecialRegs(ARMEmitter::Register TmpReg, ARMEmitter::Register TmpReg2, bool SetFIZ, bool SetPredRegs) {
// The vixl simulator doesn't model AFP, so skip FPCR setup entirely there.
#ifndef VIXL_SIMULATOR
if (EmitterCTX->HostFeatures.SupportsAFP) {
// Enable AFP features when filling JIT state.
mrs(TmpReg, ARMEmitter::SystemRegister::FPCR);
// Enable FPCR.NEP and FPCR.AH
// NEP(2): Changes ASIMD scalar instructions to insert in to the lower bits of the destination.
// AH(1): Changes NaN behaviour in some instructions. Specifically fmin, fmax.
//
// Additional interesting AFP bits:
// FIZ(0): Flush Inputs to Zero
orr(ARMEmitter::Size::i64Bit, TmpReg, TmpReg,
(1U << 2) | // NEP
(1U << 1)); // AH
if (SetFIZ) {
// Insert MXCSR.DAZ in to FIZ
// Load the guest's mxcsr out of CPUState, then copy bit 6 (DAZ) into
// bit 0 (FIZ) of the staged FPCR value.
ldr(TmpReg2.W(), STATE.R(), offsetof(FEXCore::Core::CPUState, mxcsr));
bfxil(ARMEmitter::Size::i64Bit, TmpReg, TmpReg2, 6, 1);
}
// Commit the assembled value to the real FPCR.
msr(ARMEmitter::SystemRegister::FPCR, TmpReg);
}
#endif
if (SetPredRegs) {
// Set up predicate registers.
// We don't bother spilling these in SpillStaticRegs,
// since all that matters is we restore them on a fill.
// It's not a concern if they get trounced by something else.
if (EmitterCTX->HostFeatures.SupportsSVE256) {
// All-true predicate over a 32-byte (256-bit) vector length.
ptrue(ARMEmitter::SubRegSize::i8Bit, PRED_TMP_32B, ARMEmitter::PredicatePattern::SVE_VL32);
}
if (EmitterCTX->HostFeatures.SupportsSVE128) {
// All-true predicate over a 16-byte (128-bit) vector length.
ptrue(ARMEmitter::SubRegSize::i8Bit, PRED_TMP_16B, ARMEmitter::PredicatePattern::SVE_VL16);
}
}
}
void Arm64Emitter::SpillStaticRegs(ARMEmitter::Register TmpReg, bool FPRs, uint32_t GPRSpillMask, uint32_t FPRSpillMask) {
#ifndef VIXL_SIMULATOR
if (EmitterCTX->HostFeatures.SupportsAFP) {
@ -689,7 +730,7 @@ void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRF
"spilling!");
auto TmpReg = *OptionalReg;
[[maybe_unused]] auto TmpReg2 = *OptionalReg2;
auto TmpReg2 = *OptionalReg2;
#ifdef _M_ARM_64EC
// Load STATE in from the CPU area as x28 is not callee saved in the ARM64EC ABI.
@ -697,30 +738,6 @@ void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRF
ldr(STATE, TmpReg, CPU_AREA_EMULATOR_DATA_OFFSET);
#endif
#ifndef VIXL_SIMULATOR
if (EmitterCTX->HostFeatures.SupportsAFP) {
// Enable AFP features when filling JIT state.
LOGMAN_THROW_A_FMT(GPRFillMask != 0, "Must fill at least 1 GPR for a temp");
mrs(TmpReg, ARMEmitter::SystemRegister::FPCR);
// Enable FPCR.NEP and FPCR.AH
// NEP(2): Changes ASIMD scalar instructions to insert in to the lower bits of the destination.
// AH(1): Changes NaN behaviour in some instructions. Specifically fmin, fmax.
//
// Additional interesting AFP bits:
// FIZ(0): Flush Inputs to Zero
orr(ARMEmitter::Size::i64Bit, TmpReg, TmpReg,
(1U << 2) | // NEP
(1U << 1)); // AH
// Insert MXCSR.DAZ in to FIZ
ldr(TmpReg2.W(), STATE.R(), offsetof(FEXCore::Core::CPUState, mxcsr));
bfxil(ARMEmitter::Size::i64Bit, TmpReg, TmpReg2, 6, 1);
msr(ARMEmitter::SystemRegister::FPCR, TmpReg);
}
#endif
// Regardless of what GPRs/FPRs we're filling, we need to fill NZCV since it
// is always static and was almost certainly clobbered.
//
@ -729,19 +746,9 @@ void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRF
ldr(TmpReg.W(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.flags[24]));
msr(ARMEmitter::SystemRegister::NZCV, TmpReg);
FillSpecialRegs(TmpReg, TmpReg2, true, FPRs);
if (FPRs) {
// Set up predicate registers.
// We don't bother spilling these in SpillStaticRegs,
// since all that matters is we restore them on a fill.
// It's not a concern if they get trounced by something else.
if (EmitterCTX->HostFeatures.SupportsSVE128) {
ptrue(ARMEmitter::SubRegSize::i8Bit, PRED_TMP_16B, ARMEmitter::PredicatePattern::SVE_VL16);
}
if (EmitterCTX->HostFeatures.SupportsSVE256) {
ptrue(ARMEmitter::SubRegSize::i8Bit, PRED_TMP_32B, ARMEmitter::PredicatePattern::SVE_VL32);
}
if (EmitterCTX->HostFeatures.SupportsAVX && EmitterCTX->HostFeatures.SupportsSVE256) {
for (size_t i = 0; i < StaticFPRegisters.size(); i++) {
const auto Reg = StaticFPRegisters[i];

View File

@ -104,6 +104,7 @@ protected:
void LoadConstant(ARMEmitter::Size s, ARMEmitter::Register Reg, uint64_t Constant, bool NOPPad = false);
void FillSpecialRegs(ARMEmitter::Register TmpReg, ARMEmitter::Register TmpReg2, bool SetFIZ, bool SetPredRegs);
// NOTE: These functions WILL clobber the register TMP4 if AVX support is enabled
// and FPRs are being spilled or filled. If only GPRs are spilled/filled, then

View File

@ -102,9 +102,7 @@ void Dispatcher::EmitDispatcher() {
add(ARMEmitter::Size::i64Bit, StaticRegisters[X86State::REG_RSP], ARMEmitter::Reg::rsp, 0);
add(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::rsp, TMP1, 0);
if (EmitterCTX->HostFeatures.SupportsSVE128) {
ptrue(ARMEmitter::SubRegSize::i8Bit, PRED_TMP_16B, ARMEmitter::PredicatePattern::SVE_VL16);
}
FillSpecialRegs(TMP1, TMP2, false, true);
// Enter JIT
#endif

View File

@ -53,6 +53,11 @@ BeginSimulation:
// Expects the target code address in x9
.global ExitFunctionEC
ExitFunctionEC:
// Clear the AFP NEP and AH bits in FPCR as native code won't expect their behaviour.
mrs x17, fpcr
and x17, x17, #~6 // NEP + AH
msr fpcr, x17
// Either return to an exit thunk (return to ARM64EC function) or call an entry thunk (call to ARM64EC function).
// It is assumed that a 'blr x16' instruction is only ever used to call into x86 code from an exit thunk, and that all
// exported ARM64EC functions have a 4-byte offset to their entry thunk immediately before their first instruction.