Arm64: Implements support for DAZ using AFP.FIZ

When AFP is supported, we can actually support DAZ. This might also
fix the audio corruption in Animal Well, but I can't test it until Steam
is running on Oryon. This requires a bit of plumbing for MXCSR: we were
hacking around it before, but now we actually want to store the value.

Fixes #3856
This commit is contained in:
Ryan Houdek 2024-07-20 15:29:04 -07:00
parent 54fc8cb0bd
commit b78da2e5ad
No known key found for this signature in database
7 changed files with 51 additions and 24 deletions

View File

@ -575,7 +575,7 @@ void Arm64Emitter::SpillStaticRegs(ARMEmitter::Register TmpReg, bool FPRs, uint3
if (EmitterCTX->HostFeatures.SupportsAFP) {
// Disable AFP features when spilling registers.
//
// Disable FPCR.NEP and FPCR.AH
// Disable FPCR.NEP and FPCR.AH and FPCR.FIZ
// NEP(2): Changes ASIMD scalar instructions to insert into the lower bits of the destination.
// AH(1): Changes NaN behaviour in some instructions. Specifically fmin, fmax.
// Also interacts with RPRES to change reciprocal/rsqrt precision from 8-bit mantissa to 12-bit.
@ -585,7 +585,8 @@ void Arm64Emitter::SpillStaticRegs(ARMEmitter::Register TmpReg, bool FPRs, uint3
mrs(TmpReg, ARMEmitter::SystemRegister::FPCR);
bic(ARMEmitter::Size::i64Bit, TmpReg, TmpReg,
(1U << 2) | // NEP
(1U << 1)); // AH
(1U << 1) | // AH
(1U << 0)); // FIZ
msr(ARMEmitter::SystemRegister::FPCR, TmpReg);
}
#endif
@ -664,18 +665,24 @@ void Arm64Emitter::SpillStaticRegs(ARMEmitter::Register TmpReg, bool FPRs, uint3
}
void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRFillMask) {
ARMEmitter::Register TmpReg = ARMEmitter::Reg::r0;
LOGMAN_THROW_A_FMT(GPRFillMask != 0, "Must fill at least 1 GPR for a temp");
[[maybe_unused]] bool FoundRegister {};
for (auto Reg : StaticRegisters) {
if (((1U << Reg.Idx()) & GPRFillMask)) {
TmpReg = Reg;
FoundRegister = true;
break;
auto FindTempReg = [this](uint32_t* GPRFillMask) -> std::optional<ARMEmitter::Register> {
for (auto Reg : StaticRegisters) {
if (((1U << Reg.Idx()) & *GPRFillMask)) {
*GPRFillMask &= ~(1U << Reg.Idx());
return std::make_optional(Reg);
}
}
}
return std::nullopt;
};
LOGMAN_THROW_A_FMT(FoundRegister, "Didn't have an SRA register to use as a temporary while spilling!");
LOGMAN_THROW_A_FMT(GPRFillMask != 0, "Must fill at least 2 GPRs for a temp");
uint32_t TempGPRFillMask = GPRFillMask;
auto Reg = FindTempReg(&TempGPRFillMask);
auto Reg2 = FindTempReg(&TempGPRFillMask);
LOGMAN_THROW_A_FMT(Reg.has_value() && Reg2.has_value(), "Didn't have an SRA register to use as a temporary while spilling!");
auto TmpReg = *Reg;
[[maybe_unused]] auto TmpReg2 = *Reg2;
#ifndef VIXL_SIMULATOR
if (EmitterCTX->HostFeatures.SupportsAFP) {
@ -692,6 +699,11 @@ void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRF
orr(ARMEmitter::Size::i64Bit, TmpReg, TmpReg,
(1U << 2) | // NEP
(1U << 1)); // AH
// Insert MXCSR.DAZ into FIZ
ldr(TmpReg2.W(), STATE.R(), offsetof(FEXCore::Core::CPUState, mxcsr));
bfxil(ARMEmitter::Size::i64Bit, TmpReg, TmpReg2, 6, 1);
msr(ARMEmitter::SystemRegister::FPCR, TmpReg);
}
#endif

View File

@ -98,6 +98,7 @@ DEF_OP(GetRoundingMode) {
DEF_OP(SetRoundingMode) {
auto Op = IROp->C<IR::IROp_SetRoundingMode>();
auto Src = GetReg(Op->RoundMode.ID());
auto MXCSR = GetReg(Op->MXCSR.ID());
// As above, setup the rounding flags in [31:30]
rbit(ARMEmitter::Size::i32Bit, TMP2, Src);
@ -116,6 +117,11 @@ DEF_OP(SetRoundingMode) {
lsr(ARMEmitter::Size::i64Bit, TMP2, Src, 2);
bfi(ARMEmitter::Size::i64Bit, TMP1, TMP2, 24, 1);
if (Op->SetDAZ && HostSupportsAFP) {
// Extract DAZ from MXCSR and insert it into FPCR.FIZ
bfxil(ARMEmitter::Size::i64Bit, TMP1, MXCSR, 6, 1);
}
// Now save the new FPCR
msr(ARMEmitter::SystemRegister::FPCR, TMP1);
}

View File

@ -2599,10 +2599,11 @@ void OpDispatchBuilder::SaveAVXState(Ref MemBase) {
}
Ref OpDispatchBuilder::GetMXCSR() {
// Default MXCSR Value
Ref MXCSR = _Constant(0x1F80);
Ref RoundingMode = _GetRoundingMode();
return _Bfi(OpSize::i32Bit, 3, 13, MXCSR, RoundingMode);
Ref MXCSR = _LoadContext(OpSize::i32Bit, GPRClass, offsetof(FEXCore::Core::CPUState, mxcsr));
// Mask out unsupported bits
// Keeps FZ, RC, exception masks, and DAZ
MXCSR = _And(OpSize::i32Bit, MXCSR, _Constant(0xFFC0));
return MXCSR;
}
void OpDispatchBuilder::FXRStoreOp(OpcodeArgs) {
@ -2711,9 +2712,13 @@ void OpDispatchBuilder::RestoreSSEState(Ref MemBase) {
}
void OpDispatchBuilder::RestoreMXCSRState(Ref MXCSR) {
// Mask out unsupported bits
MXCSR = _And(OpSize::i32Bit, MXCSR, _Constant(0xFFC0));
_StoreContext(OpSize::i32Bit, GPRClass, MXCSR, offsetof(FEXCore::Core::CPUState, mxcsr));
// We only support the rounding mode, the FTZ bit, and (when the host supports AFP) the DAZ bit being set
Ref RoundingMode = _Bfe(OpSize::i32Bit, 3, 13, MXCSR);
_SetRoundingMode(RoundingMode);
_SetRoundingMode(RoundingMode, true, MXCSR);
}
void OpDispatchBuilder::RestoreAVXState(Ref MemBase) {

View File

@ -48,7 +48,7 @@ void OpDispatchBuilder::FNINITF64(OpcodeArgs) {
auto NewFCW = _Constant(16, 0x037F);
// Init host rounding mode to zero
auto Zero = _Constant(0);
_SetRoundingMode(Zero);
_SetRoundingMode(Zero, false, Zero);
_StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));
// Init FSW to 0
@ -71,7 +71,7 @@ void OpDispatchBuilder::X87LDENVF64(OpcodeArgs) {
// ignore the rounding precision, we're always 64-bit in F64.
// extract rounding mode
Ref roundingMode = _Bfe(OpSize::i32Bit, 3, 10, NewFCW);
_SetRoundingMode(roundingMode);
_SetRoundingMode(roundingMode, false, roundingMode);
_StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));
auto NewFSW = _LoadMem(GPRClass, Size, Mem, _Constant(Size * 1), Size, MEM_OFFSET_SXTX, 1);
@ -89,7 +89,7 @@ void OpDispatchBuilder::X87FLDCWF64(OpcodeArgs) {
// ignore the rounding precision, we're always 64-bit in F64.
// extract rounding mode
Ref roundingMode = _Bfe(OpSize::i32Bit, 3, 10, NewFCW);
_SetRoundingMode(roundingMode);
_SetRoundingMode(roundingMode, false, roundingMode);
_StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));
}
@ -783,7 +783,7 @@ void OpDispatchBuilder::X87FRSTORF64(OpcodeArgs) {
auto roundMask = _Constant(3);
roundingMode = _Lshr(OpSize::i32Bit, roundingMode, roundShift);
roundingMode = _And(OpSize::i32Bit, roundingMode, roundMask);
_SetRoundingMode(roundingMode);
_SetRoundingMode(roundingMode, false, roundingMode);
_StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));
_StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));

View File

@ -226,7 +226,7 @@
"DestSize": "4"
},
"SetRoundingMode GPR:$RoundMode": {
"SetRoundingMode GPR:$RoundMode, i1:$SetDAZ, GPR:$MXCSR": {
"Desc": ["Sets the current rounding mode options for the thread"
],
"HasSideEffects": true

View File

@ -104,7 +104,7 @@ struct CPUState {
// Raw segment register indexes
uint16_t es_idx {}, cs_idx {}, ss_idx {}, ds_idx {};
uint16_t gs_idx {}, fs_idx {};
uint16_t _pad2[2];
uint32_t mxcsr {};
// Segment registers holding base addresses
uint32_t es_cached {}, cs_cached {}, ss_cached {}, ds_cached {};
@ -162,6 +162,10 @@ struct CPUState {
// we encode DF as 1/-1 within the JIT, so we have to write 0x1 here to
// zero DF.
flags[X86State::RFLAG_DF_RAW_LOC] = 0x1;
// Default mxcsr value
// All exception masks enabled.
mxcsr = 0x1F80;
}
};
static_assert(std::is_trivially_copyable_v<CPUState>, "Needs to be trivial");

View File

@ -2,7 +2,7 @@
{
"HostFeatures": ["AVX"],
"RegData": {
"RAX": "0xFF80"
"RAX": "0xFFC0"
},
"MemoryRegions": {
"0x100000000": "4096"