Merge pull request #2255 from Sonicadvance1/optimize_sve_spillfill

Arm64: Optimize SVE register spilling and filling
This commit is contained in:
Mai 2022-12-16 13:19:05 +00:00 committed by GitHub
commit 1ab4471ef9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 21 additions and 30 deletions

View File

@ -345,18 +345,20 @@ void Arm64Emitter::PushDynamicRegsAndLR(aarch64::Register TmpReg) {
const uint64_t SPOffset = AlignUp(GPRSize + FPRSize, 16);
sub(sp, sp, SPOffset);
int i = 0;
// rsp capable move
add(TmpReg, aarch64::sp, 0);
if (CanUseSVE) {
for (const auto& RA : RAFPR) {
mov(TMP4, i * 8);
st1b(RA.Z().VnB(), PRED_TMP_32B, SVEMemOperand(sp, TMP4));
i += 4;
for (size_t i = 0; i < RAFPR.size(); i += 4) {
const auto Reg1 = RAFPR[i];
const auto Reg2 = RAFPR[i + 1];
const auto Reg3 = RAFPR[i + 2];
const auto Reg4 = RAFPR[i + 3];
st4b(Reg1.Z().VnB(), Reg2.Z().VnB(), Reg3.Z().VnB(), Reg4.Z().VnB(), PRED_TMP_32B, SVEMemOperand(TmpReg));
add(TmpReg, TmpReg, 32 * 4);
}
str(lr, MemOperand(sp, i * 8));
} else {
// rsp capable move
add(TmpReg, aarch64::sp, 0);
for (size_t i = 0; i < RAFPR.size(); i += 4) {
const auto Reg1 = RAFPR[i];
const auto Reg2 = RAFPR[i + 1];
@ -364,29 +366,24 @@ void Arm64Emitter::PushDynamicRegsAndLR(aarch64::Register TmpReg) {
const auto Reg4 = RAFPR[i + 3];
st1(Reg1.V2D(), Reg2.V2D(), Reg3.V2D(), Reg4.V2D(), MemOperand(TmpReg, 64, PostIndex));
}
str(aarch64::lr, MemOperand(TmpReg, 0));
}
str(aarch64::lr, MemOperand(TmpReg, 0));
}
void Arm64Emitter::PopDynamicRegsAndLR() {
const auto CanUseSVE = EmitterCTX->HostFeatures.SupportsAVX;
const auto GPRSize = 1 * Core::CPUState::GPR_REG_SIZE;
const auto FPRRegSize = CanUseSVE ? Core::CPUState::XMM_AVX_REG_SIZE
: Core::CPUState::XMM_SSE_REG_SIZE;
const auto FPRSize = RAFPR.size() * FPRRegSize;
const uint64_t SPOffset = AlignUp(GPRSize + FPRSize, 16);
int i = 0;
if (CanUseSVE) {
for (const auto& RA : RAFPR) {
mov(TMP4, i * 8);
ld1b(RA.Z().VnB(), PRED_TMP_32B.Zeroing(), SVEMemOperand(sp, TMP4));
i += 4;
for (size_t i = 0; i < RAFPR.size(); i += 4) {
const auto Reg1 = RAFPR[i];
const auto Reg2 = RAFPR[i + 1];
const auto Reg3 = RAFPR[i + 2];
const auto Reg4 = RAFPR[i + 3];
ld4b(Reg1.Z().VnB(), Reg2.Z().VnB(), Reg3.Z().VnB(), Reg4.Z().VnB(), PRED_TMP_32B.Zeroing(), SVEMemOperand(aarch64::sp));
add(aarch64::sp, aarch64::sp, 32 * 4);
}
ldr(lr, MemOperand(sp, i * 8));
add(sp, sp, SPOffset);
} else {
for (size_t i = 0; i < RAFPR.size(); i += 4) {
const auto Reg1 = RAFPR[i];
const auto Reg2 = RAFPR[i + 1];
@ -394,9 +391,9 @@ void Arm64Emitter::PopDynamicRegsAndLR() {
const auto Reg4 = RAFPR[i + 3];
ld1(Reg1.V2D(), Reg2.V2D(), Reg3.V2D(), Reg4.V2D(), MemOperand(aarch64::sp, 64, PostIndex));
}
ldr(aarch64::lr, MemOperand(aarch64::sp, 16, PostIndex));
}
ldr(aarch64::lr, MemOperand(aarch64::sp, 16, PostIndex));
}
void Arm64Emitter::Align16B() {

View File

@ -38,12 +38,7 @@ namespace FEXCore::CPU {
using namespace vixl;
using namespace vixl::aarch64;
#ifdef VIXL_SIMULATOR
// Vixl simulator needs at least 4476 bytes for its dispatcher
constexpr size_t MAX_DISPATCHER_CODE_SIZE = 4096 * 2;
#else
constexpr size_t MAX_DISPATCHER_CODE_SIZE = 4096;
#endif
Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const DispatcherConfig &config)
: FEXCore::CPU::Dispatcher(ctx, config), Arm64Emitter(ctx, MAX_DISPATCHER_CODE_SIZE)
@ -51,7 +46,6 @@ Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const Dispatche
, Simulator {&Decoder}
#endif
{
#ifdef VIXL_SIMULATOR
// Hardcode a 256-bit vector width if we are running in the simulator.
Simulator.SetVectorLengthInBits(256);