OpcodeDispatcher: reg cache mmx

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
This commit is contained in:
Alyssa Rosenzweig 2024-07-03 08:51:10 -04:00
parent a4f8bbff02
commit 294f10fdd0
3 changed files with 59 additions and 10 deletions

View File

@ -4374,7 +4374,9 @@ Ref OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, const X86T
const auto highIndex = Operand.Data.GPR.HighBits ? 1 : 0;
if (gpr >= FEXCore::X86State::REG_MM_0) {
A.Base = _LoadContext(OpSize, FPRClass, offsetof(FEXCore::Core::CPUState, mm[gpr - FEXCore::X86State::REG_MM_0]));
LOGMAN_THROW_A_FMT(OpSize == 8, "full");
A.Base = LoadContext(8, MM0Index + gpr - FEXCore::X86State::REG_MM_0);
} else if (gpr >= FEXCore::X86State::REG_XMM_0) {
const auto gprIndex = gpr - X86State::REG_XMM_0;
@ -4460,7 +4462,12 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl
const auto gpr = Operand.Data.GPR.GPR;
if (gpr >= FEXCore::X86State::REG_MM_0) {
_StoreContext(OpSize, Class, Src, offsetof(FEXCore::Core::CPUState, mm[gpr - FEXCore::X86State::REG_MM_0]));
LOGMAN_THROW_A_FMT(OpSize == 8, "full");
LOGMAN_THROW_A_FMT(Class == FPRClass, "MMX is floaty");
// Partial store into bottom 64-bits, leave the upper bits unaffected.
// XXX: We actually should set the upper bits to all-1s?
StoreContextPartial(MM0Index + gpr - FEXCore::X86State::REG_MM_0, Src);
} else if (gpr >= FEXCore::X86State::REG_XMM_0) {
const auto gprIndex = gpr - X86State::REG_XMM_0;
const auto VectorSize = (CTX->HostFeatures.SupportsSVE256 && CTX->HostFeatures.SupportsAVX) ? 32 : 16;

View File

@ -1302,7 +1302,9 @@ public:
} else if (Index == DFIndex) {
_StoreFlag(Value, X86State::RFLAG_DF_RAW_LOC);
} else {
_StoreContext(CacheIndexToSize(Index), CacheIndexClass(Index), Value, CacheIndexToContextOffset(Index));
bool Partial = RegCache.Partial & (1ull << Index);
unsigned Size = Partial ? 8 : CacheIndexToSize(Index);
_StoreContext(Size, CacheIndexClass(Index), Value, CacheIndexToContextOffset(Index));
}
Bits &= ~(1ull << Index);
@ -1310,6 +1312,7 @@ public:
RegCache.Written &= ~Mask;
RegCache.Cached &= ~Mask;
RegCache.Partial &= ~Mask;
}
protected:
@ -1814,6 +1817,9 @@ private:
static const int GPR15Index = 15;
static const int PFIndex = 16;
static const int AFIndex = 17;
/* Gap 18..19 */
// Cache slots 20..27 cover the eight MMX registers; CacheIndexToContextOffset
// maps slot (MM0Index + n) to CPUState::mm[n].
static const int MM0Index = 20;
static const int MM7Index = 27;
static const int AbridgedFTWIndex = 28;
/* Gap 29..30 */
static const int DFIndex = 31;
@ -1824,6 +1830,7 @@ private:
int CacheIndexToContextOffset(int Index) {
switch (Index) {
case MM0Index ... MM7Index: return offsetof(FEXCore::Core::CPUState, mm[Index - MM0Index]);
case AVXHigh0Index ... AVXHigh15Index: return offsetof(FEXCore::Core::CPUState, avx_high[Index - AVXHigh0Index][0]);
case AbridgedFTWIndex: return offsetof(FEXCore::Core::CPUState, AbridgedFTW);
default: return -1;
@ -1831,7 +1838,7 @@ private:
}
RegisterClassType CacheIndexClass(int Index) {
if (Index >= FPR0Index) {
if ((Index >= MM0Index && Index <= MM7Index) || Index >= FPR0Index) {
return FPRClass;
} else {
return GPRClass;
@ -1839,7 +1846,9 @@ private:
}
unsigned CacheIndexToSize(int Index) {
if (Index >= AVXHigh0Index) {
// MMX registers are rounded up to 128-bit since they are shared with 80-bit
// x87 registers, even though MMX is logically only 64-bit.
if (Index >= AVXHigh0Index || ((Index >= MM0Index && Index <= MM7Index))) {
return 16;
} else {
return 1;
@ -1849,6 +1858,11 @@ private:
// Register cache bookkeeping. Cached, Written and Partial are bitmasks with
// one bit per cache index; bit N describes Value[N].
struct {
// LoadRegCache only emits a fresh load for an index when its bit here is
// clear; the bit is cleared again when the entry is flushed.
uint64_t Cached;
// Set when a store into the cache is pending for the index; cleared
// together with Cached and Partial when the entry is flushed.
uint64_t Written;
// Indicates that Value contains only the lower 64 bits of the full 80-bit
// register. Used for MMX/x87 optimization. Set by an 8-byte load and by
// StoreContextPartial; while set, the write-back stores only 8 bytes.
uint64_t Partial;
// Cached IR value per cache index (indices are asserted to be < 64).
Ref Value[64];
} RegCache {};
@ -1862,11 +1876,29 @@ private:
LOGMAN_THROW_AA_FMT(Index < 64, "valid index");
uint64_t Bit = (1ull << (uint64_t)Index);
if (Size == 16 && (RegCache.Partial & Bit)) {
// We need to load the full register extent if we previously did a partial access.
Ref Value = RegCache.Value[Index];
Ref Full = _LoadContext(Size, RegClass, Offset);
// If we did a partial store, we're inserting into the full register
if (RegCache.Written & Bit) {
Full = _VInsElement(16, 8, 0, 0, Full, Value);
}
RegCache.Value[Index] = Full;
}
if (!(RegCache.Cached & Bit)) {
if (Index == DFIndex) {
RegCache.Value[Index] = _LoadDF();
} else if (Index == AbridgedFTWIndex || Index >= AVXHigh0Index) {
} else if ((Index >= MM0Index && Index <= AbridgedFTWIndex) || Index >= AVXHigh0Index) {
RegCache.Value[Index] = _LoadContext(Size, RegClass, Offset);
// We may have done a partial load, this requires special handling.
if (Size == 8) {
RegCache.Partial |= Bit;
}
} else {
RegCache.Value[Index] = _LoadRegister(Offset, RegClass, Size);
}
@ -1881,8 +1913,12 @@ private:
return LoadRegCache(Reg, GPR0Index + Reg, GPRClass, CTX->GetGPRSize());
}
// Loads the context-backed cache slot `Index` through the register cache,
// accessing `Size` bytes. The context offset and register class are derived
// from the slot index.
Ref LoadContext(uint8_t Size, uint8_t Index) {
  const auto ContextOffset = CacheIndexToContextOffset(Index);
  const auto SlotClass = CacheIndexClass(Index);

  return LoadRegCache(ContextOffset, Index, SlotClass, Size);
}
// Loads cache slot `Index` through the register cache at the slot's full
// size, as reported by CacheIndexToSize.
// NOTE(review): this view shows both the pre-change and the post-change
// return statement; as written only the first one is reachable.
Ref LoadContext(uint8_t Index) {
return LoadRegCache(CacheIndexToContextOffset(Index), Index, CacheIndexClass(Index), CacheIndexToSize(Index));
return LoadContext(CacheIndexToSize(Index), Index);
}
Ref LoadXMMRegister(uint8_t Reg) {
@ -1904,6 +1940,12 @@ private:
RegCache.Written |= Bit;
}
// Stores `Value` into cache slot `Index` and then flags the slot as partial,
// i.e. the cached value only represents the low 64 bits of the register.
void StoreContextPartial(uint8_t Index, Ref Value) {
  const uint64_t PartialBit = 1ull << (uint64_t)Index;

  // Keep this order: record the store first, then mark the slot partial.
  StoreContext(Index, Value);
  RegCache.Partial |= PartialBit;
}
// Stores `Value` into the cache slot for register number `Reg`. `FPR` selects
// the slot range: slots start at FPR0Index when true, at GPR0Index otherwise.
void StoreRegister(uint8_t Reg, bool FPR, Ref Value) {
  if (FPR) {
    StoreContext(Reg + FPR0Index, Value);
  } else {
    StoreContext(Reg + GPR0Index, Value);
  }
}

View File

@ -2747,7 +2747,7 @@ void OpDispatchBuilder::SaveX87State(OpcodeArgs, Ref MemBase) {
// If the OSFXSR bit in CR4 is not set then FXSAVE /may/ not save the XMM registers
// This is implementation dependent
for (uint32_t i = 0; i < Core::CPUState::NUM_MMS; ++i) {
Ref MMReg = _LoadContext(16, FPRClass, offsetof(FEXCore::Core::CPUState, mm[i]));
Ref MMReg = LoadContext(MM0Index + i);
_StoreMem(FPRClass, 16, MMReg, MemBase, _Constant(i * 16 + 32), 16, MEM_OFFSET_SXTX, 1);
}
@ -2879,7 +2879,7 @@ void OpDispatchBuilder::RestoreX87State(Ref MemBase) {
for (uint32_t i = 0; i < Core::CPUState::NUM_MMS; ++i) {
auto MMReg = _LoadMem(FPRClass, 16, MemBase, _Constant(i * 16 + 32), 16, MEM_OFFSET_SXTX, 1);
_StoreContext(16, FPRClass, MMReg, offsetof(FEXCore::Core::CPUState, mm[i]));
StoreContext(MM0Index + i, MMReg);
}
}
@ -2918,7 +2918,7 @@ void OpDispatchBuilder::DefaultX87State(OpcodeArgs) {
// all of the ST0-7/MM0-7 registers to zero.
Ref ZeroVector = LoadZeroVector(Core::CPUState::MM_REG_SIZE);
for (uint32_t i = 0; i < Core::CPUState::NUM_MMS; ++i) {
_StoreContext(16, FPRClass, ZeroVector, offsetof(FEXCore::Core::CPUState, mm[i]));
StoreContext(MM0Index + i, ZeroVector);
}
}