Merge pull request #3927 from bylaws/winafp

ARM64EC: Set appropriate AFP and SVE256 state on JIT entry/exit
This commit is contained in:
Ryan Houdek 2024-08-08 22:21:23 -07:00 committed by GitHub
commit 85d1b573ef
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 51 additions and 40 deletions

View File

@ -570,6 +570,47 @@ void Arm64Emitter::PopCalleeSavedRegisters() {
}
}
// Emits instructions that (re)initialize special host register state when
// entering JIT code: the AFP bits in FPCR and, optionally, the fixed SVE
// predicate temporaries.
//
// TmpReg/TmpReg2: scratch GPRs this routine may clobber while building FPCR.
// SetFIZ:         when true, also mirror the guest MXCSR.DAZ flag into FPCR.FIZ.
// SetPredRegs:    when true, re-materialize PRED_TMP_16B/PRED_TMP_32B via ptrue.
void Arm64Emitter::FillSpecialRegs(ARMEmitter::Register TmpReg, ARMEmitter::Register TmpReg2, bool SetFIZ, bool SetPredRegs) {
// The vixl simulator doesn't model AFP, so skip FPCR setup entirely there.
#ifndef VIXL_SIMULATOR
if (EmitterCTX->HostFeatures.SupportsAFP) {
// Enable AFP features when filling JIT state.
mrs(TmpReg, ARMEmitter::SystemRegister::FPCR);
// Enable FPCR.NEP and FPCR.AH
// NEP(2): Changes ASIMD scalar instructions to insert in to the lower bits of the destination.
// AH(1): Changes NaN behaviour in some instructions. Specifically fmin, fmax.
//
// Additional interesting AFP bits:
// FIZ(0): Flush Inputs to Zero
orr(ARMEmitter::Size::i64Bit, TmpReg, TmpReg,
(1U << 2) | // NEP
(1U << 1)); // AH
if (SetFIZ) {
// Insert MXCSR.DAZ in to FIZ
// Load the guest's mxcsr out of CPUState, then copy bit 6 (DAZ) into
// bit 0 (FIZ) of the staged FPCR value.
ldr(TmpReg2.W(), STATE.R(), offsetof(FEXCore::Core::CPUState, mxcsr));
bfxil(ARMEmitter::Size::i64Bit, TmpReg, TmpReg2, 6, 1);
}
// Commit the assembled value to the real FPCR.
msr(ARMEmitter::SystemRegister::FPCR, TmpReg);
}
#endif
if (SetPredRegs) {
// Set up predicate registers.
// We don't bother spilling these in SpillStaticRegs,
// since all that matters is we restore them on a fill.
// It's not a concern if they get trounced by something else.
if (EmitterCTX->HostFeatures.SupportsSVE256) {
// All-true predicate over a 32-byte (256-bit) vector length.
ptrue(ARMEmitter::SubRegSize::i8Bit, PRED_TMP_32B, ARMEmitter::PredicatePattern::SVE_VL32);
}
if (EmitterCTX->HostFeatures.SupportsSVE128) {
// All-true predicate over a 16-byte (128-bit) vector length.
ptrue(ARMEmitter::SubRegSize::i8Bit, PRED_TMP_16B, ARMEmitter::PredicatePattern::SVE_VL16);
}
}
}
void Arm64Emitter::SpillStaticRegs(ARMEmitter::Register TmpReg, bool FPRs, uint32_t GPRSpillMask, uint32_t FPRSpillMask) {
#ifndef VIXL_SIMULATOR
if (EmitterCTX->HostFeatures.SupportsAFP) {
@ -689,7 +730,7 @@ void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRF
"spilling!");
auto TmpReg = *OptionalReg;
[[maybe_unused]] auto TmpReg2 = *OptionalReg2;
auto TmpReg2 = *OptionalReg2;
#ifdef _M_ARM_64EC
// Load STATE in from the CPU area as x28 is not callee saved in the ARM64EC ABI.
@ -697,30 +738,6 @@ void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRF
ldr(STATE, TmpReg, CPU_AREA_EMULATOR_DATA_OFFSET);
#endif
#ifndef VIXL_SIMULATOR
if (EmitterCTX->HostFeatures.SupportsAFP) {
// Enable AFP features when filling JIT state.
LOGMAN_THROW_A_FMT(GPRFillMask != 0, "Must fill at least 1 GPR for a temp");
mrs(TmpReg, ARMEmitter::SystemRegister::FPCR);
// Enable FPCR.NEP and FPCR.AH
// NEP(2): Changes ASIMD scalar instructions to insert in to the lower bits of the destination.
// AH(1): Changes NaN behaviour in some instructions. Specifically fmin, fmax.
//
// Additional interesting AFP bits:
// FIZ(0): Flush Inputs to Zero
orr(ARMEmitter::Size::i64Bit, TmpReg, TmpReg,
(1U << 2) | // NEP
(1U << 1)); // AH
// Insert MXCSR.DAZ in to FIZ
ldr(TmpReg2.W(), STATE.R(), offsetof(FEXCore::Core::CPUState, mxcsr));
bfxil(ARMEmitter::Size::i64Bit, TmpReg, TmpReg2, 6, 1);
msr(ARMEmitter::SystemRegister::FPCR, TmpReg);
}
#endif
// Regardless of what GPRs/FPRs we're filling, we need to fill NZCV since it
// is always static and was almost certainly clobbered.
//
@ -729,19 +746,9 @@ void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRF
ldr(TmpReg.W(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.flags[24]));
msr(ARMEmitter::SystemRegister::NZCV, TmpReg);
FillSpecialRegs(TmpReg, TmpReg2, true, FPRs);
if (FPRs) {
// Set up predicate registers.
// We don't bother spilling these in SpillStaticRegs,
// since all that matters is we restore them on a fill.
// It's not a concern if they get trounced by something else.
if (EmitterCTX->HostFeatures.SupportsSVE128) {
ptrue(ARMEmitter::SubRegSize::i8Bit, PRED_TMP_16B, ARMEmitter::PredicatePattern::SVE_VL16);
}
if (EmitterCTX->HostFeatures.SupportsSVE256) {
ptrue(ARMEmitter::SubRegSize::i8Bit, PRED_TMP_32B, ARMEmitter::PredicatePattern::SVE_VL32);
}
if (EmitterCTX->HostFeatures.SupportsAVX && EmitterCTX->HostFeatures.SupportsSVE256) {
for (size_t i = 0; i < StaticFPRegisters.size(); i++) {
const auto Reg = StaticFPRegisters[i];

View File

@ -104,6 +104,7 @@ protected:
void LoadConstant(ARMEmitter::Size s, ARMEmitter::Register Reg, uint64_t Constant, bool NOPPad = false);
void FillSpecialRegs(ARMEmitter::Register TmpReg, ARMEmitter::Register TmpReg2, bool SetFIZ, bool SetPredRegs);
// NOTE: These functions WILL clobber the register TMP4 if AVX support is enabled
// and FPRs are being spilled or filled. If only GPRs are spilled/filled, then

View File

@ -102,9 +102,7 @@ void Dispatcher::EmitDispatcher() {
add(ARMEmitter::Size::i64Bit, StaticRegisters[X86State::REG_RSP], ARMEmitter::Reg::rsp, 0);
add(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::rsp, TMP1, 0);
if (EmitterCTX->HostFeatures.SupportsSVE128) {
ptrue(ARMEmitter::SubRegSize::i8Bit, PRED_TMP_16B, ARMEmitter::PredicatePattern::SVE_VL16);
}
FillSpecialRegs(TMP1, TMP2, false, true);
// Enter JIT
#endif

View File

@ -53,6 +53,11 @@ BeginSimulation:
// Expects the target code address in x9
.global ExitFunctionEC
ExitFunctionEC:
// Clear the AFP NEP and AH bits in FPCR as native code won't expect their behaviour.
mrs x17, fpcr
and x17, x17, #~6 // NEP + AH
msr fpcr, x17
// Either return to an exit thunk (return to ARM64EC function) or call an entry thunk (call to ARM64EC function).
// It is assumed that a 'blr x16' instruction is only ever used to call into x86 code from an exit thunk, and that all
// exported ARM64EC functions have a 4-byte offset to their entry thunk immediately before their first instruction.