CoreState: Adds avx_high structure for tracking decoupled AVX halves.

Needed something in between `InlineJITBlockHeader` and `avx_high` in
order to satisfy the 16-byte alignment requirement of `avx_high`. Chose
`DeferredSignalRefCount` because we hit it quite frequently and it is
basically the only 64-bit variable that we touch significantly.
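As a rough illustration, a minimal sketch of why an 8-byte member in that
slot is enough (LayoutSketch is hypothetical, and the refcount is
simplified to a plain uint64_t; member names mirror the diff below):

#include <cstddef>
#include <cstdint>

struct LayoutSketch {
  uint64_t InlineJITBlockHeader;   // offset 0: 8 bytes
  uint64_t DeferredSignalRefCount; // offset 8: hot 64-bit refcount fills the gap
  uint64_t avx_high[16][2];        // offset 16: 16-byte aligned with no explicit padding
};

static_assert(offsetof(LayoutSketch, avx_high) % 16 == 0, "avx_high needs to be 128-bit aligned!");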

In the future the CPUState object is going to need to change its layout
depending on whether the device supports SVE256, but we don't need to
frontload that work right now. It will become significantly easier to
support that path once the RCLSE pass gets deleted.
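The classification pass in the diff below already branches on SVE256
support; a condensed, hypothetical model of that pattern (Member,
ClassifyXmm, and the constants are stand-ins for the real
ContextMemberInfo machinery):

#include <cstddef>
#include <vector>

// Each xmm register becomes one classification entry whose stride and size
// depend on whether the full 256-bit register is backed by SVE256.
struct Member {
  size_t Offset;
  size_t Size;
};

constexpr size_t NUM_XMMS = 16;
constexpr size_t XMM_SSE_REG_SIZE = 16; // low 128-bit half
constexpr size_t XMM_AVX_REG_SIZE = 32; // full 256-bit register

std::vector<Member> ClassifyXmm(size_t XmmBase, bool SupportsSVE256) {
  const size_t RegSize = SupportsSVE256 ? XMM_AVX_REG_SIZE : XMM_SSE_REG_SIZE;
  std::vector<Member> Members;
  for (size_t i = 0; i < NUM_XMMS; ++i) {
    Members.push_back({XmmBase + RegSize * i, RegSize});
  }
  return Members;
}
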
Ryan Houdek 2024-06-17 17:58:48 -07:00 committed by Alyssa Rosenzweig
parent 9a71443005
commit bf812aae8f
2 changed files with 106 additions and 71 deletions


@@ -97,6 +97,27 @@ static void ClassifyContextStruct(ContextInfo* ContextClassificationInfo, bool SupportsSVE256) {
FEXCore::IR::InvalidClass,
});
// DeferredSignalRefCount
ContextClassification->emplace_back(ContextMemberInfo {
ContextMemberClassification {
offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount),
sizeof(FEXCore::Core::CPUState::DeferredSignalRefCount),
},
LastAccessType::NONE,
FEXCore::IR::InvalidClass,
});
for (size_t i = 0; i < FEXCore::Core::CPUState::NUM_XMMS; ++i) {
ContextClassification->emplace_back(ContextMemberInfo {
ContextMemberClassification {
offsetof(FEXCore::Core::CPUState, avx_high[0][0]) + FEXCore::Core::CPUState::XMM_SSE_REG_SIZE * i,
FEXCore::Core::CPUState::XMM_SSE_REG_SIZE,
},
LastAccessType::NONE,
FEXCore::IR::InvalidClass,
});
}
ContextClassification->emplace_back(ContextMemberInfo {
ContextMemberClassification {
offsetof(FEXCore::Core::CPUState, rip),
@@ -117,6 +138,48 @@ static void ClassifyContextStruct(ContextInfo* ContextClassificationInfo, bool SupportsSVE256) {
});
}
ContextClassification->emplace_back(ContextMemberInfo {
ContextMemberClassification {
offsetof(FEXCore::Core::CPUState, _pad),
sizeof(FEXCore::Core::CPUState::_pad),
},
LastAccessType::INVALID,
FEXCore::IR::InvalidClass,
});
if (SupportsSVE256) {
for (size_t i = 0; i < FEXCore::Core::CPUState::NUM_XMMS; ++i) {
ContextClassification->emplace_back(ContextMemberInfo {
ContextMemberClassification {
offsetof(FEXCore::Core::CPUState, xmm.avx.data[0][0]) + FEXCore::Core::CPUState::XMM_AVX_REG_SIZE * i,
FEXCore::Core::CPUState::XMM_AVX_REG_SIZE,
},
LastAccessType::NONE,
FEXCore::IR::InvalidClass,
});
}
} else {
for (size_t i = 0; i < FEXCore::Core::CPUState::NUM_XMMS; ++i) {
ContextClassification->emplace_back(ContextMemberInfo {
ContextMemberClassification {
offsetof(FEXCore::Core::CPUState, xmm.sse.data[0][0]) + FEXCore::Core::CPUState::XMM_SSE_REG_SIZE * i,
FEXCore::Core::CPUState::XMM_SSE_REG_SIZE,
},
LastAccessType::NONE,
FEXCore::IR::InvalidClass,
});
}
ContextClassification->emplace_back(ContextMemberInfo {
ContextMemberClassification {
offsetof(FEXCore::Core::CPUState, xmm.sse.pad[0][0]),
static_cast<uint16_t>(FEXCore::Core::CPUState::XMM_SSE_REG_SIZE * FEXCore::Core::CPUState::NUM_XMMS),
},
LastAccessType::INVALID,
FEXCore::IR::InvalidClass,
});
}
ContextClassification->emplace_back(ContextMemberInfo {
ContextMemberClassification {
offsetof(FEXCore::Core::CPUState, es_idx),
@@ -173,8 +236,8 @@ static void ClassifyContextStruct(ContextInfo* ContextClassificationInfo, bool SupportsSVE256) {
ContextClassification->emplace_back(ContextMemberInfo {
ContextMemberClassification {
offsetof(FEXCore::Core::CPUState, _pad),
sizeof(FEXCore::Core::CPUState::_pad),
offsetof(FEXCore::Core::CPUState, _pad2),
sizeof(FEXCore::Core::CPUState::_pad2),
},
LastAccessType::INVALID,
FEXCore::IR::InvalidClass,
@@ -234,39 +297,6 @@ static void ClassifyContextStruct(ContextInfo* ContextClassificationInfo, bool SupportsSVE256) {
FEXCore::IR::InvalidClass,
});
if (SupportsSVE256) {
for (size_t i = 0; i < FEXCore::Core::CPUState::NUM_XMMS; ++i) {
ContextClassification->emplace_back(ContextMemberInfo {
ContextMemberClassification {
offsetof(FEXCore::Core::CPUState, xmm.avx.data[0][0]) + FEXCore::Core::CPUState::XMM_AVX_REG_SIZE * i,
FEXCore::Core::CPUState::XMM_AVX_REG_SIZE,
},
LastAccessType::NONE,
FEXCore::IR::InvalidClass,
});
}
} else {
for (size_t i = 0; i < FEXCore::Core::CPUState::NUM_XMMS; ++i) {
ContextClassification->emplace_back(ContextMemberInfo {
ContextMemberClassification {
offsetof(FEXCore::Core::CPUState, xmm.sse.data[0][0]) + FEXCore::Core::CPUState::XMM_SSE_REG_SIZE * i,
FEXCore::Core::CPUState::XMM_SSE_REG_SIZE,
},
LastAccessType::NONE,
FEXCore::IR::InvalidClass,
});
}
ContextClassification->emplace_back(ContextMemberInfo {
ContextMemberClassification {
offsetof(FEXCore::Core::CPUState, xmm.sse.pad[0][0]),
static_cast<uint16_t>(FEXCore::Core::CPUState::XMM_SSE_REG_SIZE * FEXCore::Core::CPUState::NUM_XMMS),
},
LastAccessType::INVALID,
FEXCore::IR::InvalidClass,
});
}
for (size_t i = 0; i < FEXCore::Core::CPUState::NUM_FLAGS; ++i) {
ContextClassification->emplace_back(ContextMemberInfo {
ContextMemberClassification {
@@ -337,21 +367,11 @@ static void ClassifyContextStruct(ContextInfo* ContextClassificationInfo, bool SupportsSVE256) {
FEXCore::IR::InvalidClass,
});
// _pad2
// _pad3
ContextClassification->emplace_back(ContextMemberInfo {
ContextMemberClassification {
offsetof(FEXCore::Core::CPUState, _pad2),
sizeof(FEXCore::Core::CPUState::_pad2),
},
LastAccessType::NONE,
FEXCore::IR::InvalidClass,
});
// DeferredSignalRefCount
ContextClassification->emplace_back(ContextMemberInfo {
ContextMemberClassification {
offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount),
sizeof(FEXCore::Core::CPUState::DeferredSignalRefCount),
offsetof(FEXCore::Core::CPUState, _pad3),
sizeof(FEXCore::Core::CPUState::_pad3),
},
LastAccessType::NONE,
FEXCore::IR::InvalidClass,
@@ -385,12 +405,21 @@ static void ResetClassificationAccesses(ContextInfo* ContextClassificationInfo, bool SupportsSVE256) {
ContextClassification->at(Offset).AccessOffset = 0;
ContextClassification->at(Offset).StoreNode = nullptr;
};
size_t Offset = 0;
///< InlineJITBlockHeader
SetAccess(Offset++, LastAccessType::INVALID);
///< rip
// DeferredSignalRefCount
SetAccess(Offset++, LastAccessType::INVALID);
for (size_t i = 0; i < FEXCore::Core::CPUState::NUM_XMMS; ++i) {
///< avx_high
SetAccess(Offset++, LastAccessType::NONE);
}
// rip
SetAccess(Offset++, LastAccessType::NONE);
///< gregs
@@ -398,6 +427,19 @@ static void ResetClassificationAccesses(ContextInfo* ContextClassificationInfo, bool SupportsSVE256) {
SetAccess(Offset++, LastAccessType::NONE);
}
// pad
SetAccess(Offset++, LastAccessType::NONE);
// xmm
for (size_t i = 0; i < FEXCore::Core::CPUState::NUM_XMMS; ++i) {
SetAccess(Offset++, LastAccessType::NONE);
}
// xmm_pad
if (!SupportsSVE256) {
SetAccess(Offset++, LastAccessType::NONE);
}
// Segment indexes
SetAccess(Offset++, LastAccessType::NONE);
SetAccess(Offset++, LastAccessType::NONE);
@@ -406,7 +448,7 @@ static void ResetClassificationAccesses(ContextInfo* ContextClassificationInfo, bool SupportsSVE256) {
SetAccess(Offset++, LastAccessType::NONE);
SetAccess(Offset++, LastAccessType::NONE);
// Pad
// Pad2
SetAccess(Offset++, LastAccessType::INVALID);
// Segments
@@ -417,16 +459,6 @@ static void ResetClassificationAccesses(ContextInfo* ContextClassificationInfo, bool SupportsSVE256) {
SetAccess(Offset++, LastAccessType::NONE);
SetAccess(Offset++, LastAccessType::NONE);
///< xmm
for (size_t i = 0; i < FEXCore::Core::CPUState::NUM_XMMS; ++i) {
SetAccess(Offset++, LastAccessType::NONE);
}
if (!SupportsSVE256) {
///< xmm pad if AVX isn't supported.
SetAccess(Offset++, LastAccessType::NONE);
}
///< flags
for (size_t i = 0; i < FEXCore::Core::CPUState::NUM_FLAGS; ++i) {
SetAccess(Offset++, LastAccessType::NONE);
@@ -454,10 +486,7 @@ static void ResetClassificationAccesses(ContextInfo* ContextClassificationInfo, bool SupportsSVE256) {
///< AbridgedFTW
SetAccess(Offset++, LastAccessType::NONE);
///< _pad2
SetAccess(Offset++, LastAccessType::INVALID);
///< DeferredSignalRefCount
// pad3
SetAccess(Offset++, LastAccessType::INVALID);
}


@@ -89,18 +89,27 @@ struct CPUState {
};
uint64_t InlineJITBlockHeader {};
// Reference counter for FEX's per-thread deferred signals.
// Counts the nesting depth of program sections that cause signals to be deferred.
NonAtomicRefCounter<uint64_t> DeferredSignalRefCount;
// The high 128 bits of the AVX registers when not being emulated by SVE256.
uint64_t avx_high[16][2];
uint64_t rip {}; ///< Current core's RIP. May not be entirely accurate while JIT is active
uint64_t gregs[16] {};
uint64_t _pad {};
XMMRegs xmm {};
// Raw segment register indexes
uint16_t es_idx {}, cs_idx {}, ss_idx {}, ds_idx {};
uint16_t gs_idx {}, fs_idx {};
uint16_t _pad[2];
uint16_t _pad2[2];
// Segment registers holding base addresses
uint32_t es_cached {}, cs_cached {}, ss_cached {}, ds_cached {};
uint64_t gs_cached {};
uint64_t fs_cached {};
XMMRegs xmm {};
uint8_t flags[48] {};
uint64_t pf_raw {};
uint64_t af_raw {};
@@ -113,11 +122,7 @@ struct CPUState {
uint16_t FCW {0x37F};
uint8_t AbridgedFTW {};
uint8_t _pad2[5];
// Reference counter for FEX's per-thread deferred signals.
// Counts the nesting depth of program sections that cause signals to be deferred.
NonAtomicRefCounter<uint64_t> DeferredSignalRefCount;
uint8_t _pad3[5];
// PF/AF are statically mapped as-if they were r16/r17 (which do not exist in
// x86 otherwise). This allows a straightforward mapping for SRA.
static constexpr uint8_t PF_AS_GREG = 16;
@@ -161,6 +166,7 @@ struct CPUState {
};
static_assert(std::is_trivially_copyable_v<CPUState>, "Needs to be trivial");
static_assert(std::is_standard_layout_v<CPUState>, "This needs to be standard layout");
static_assert(offsetof(CPUState, avx_high) % 16 == 0, "avx_high needs to be 128-bit aligned!");
static_assert(offsetof(CPUState, xmm) % 32 == 0, "xmm needs to be 256-bit aligned!");
static_assert(offsetof(CPUState, mm) % 16 == 0, "mm needs to be 128-bit aligned!");
static_assert(offsetof(CPUState, gregs[15]) <= 504, "gregs maximum offset must be <= 504 for ldp/stp to work");