Mirror of https://github.com/FEX-Emu/FEX.git (synced 2025-02-21 15:22:26 +00:00)
Arm64: Optimize SVE register spilling and filling
Causes the dispatcher to drop from 4476 bytes down to 3900 bytes for SVE-256-bit supporting targets. This is done by significantly reducing the number of SVE load/store instructions: from 8 instructions per 4 registers down to 2, by switching from single-register load/stores to four-register load/stores. This should also significantly improve performance on future SVE platforms. Filling and spilling to the context still uses the old code path because SVE doesn't offer non-interleaving multi-register load/stores. Spilling and filling on the stack is fine because there we don't need to match the context's memory layout.
This commit is contained in:
parent 58fab721b3
commit 40e073c8b2
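As a back-of-the-envelope check on the instruction-count claim above, here is a small standalone C++ sketch of the arithmetic. The per-group counts are read off the diff below (old SVE path: one mov plus one st1b per register; new SVE path: one st4b plus one add per group of four registers); the register count of 16 is an assumption for illustration only, not something stated by this commit.

#include <cstdio>

int main() {
  // Assumption for illustration only: 16 dynamically register-allocated FPRs (RAFPR.size()).
  constexpr int NumFPRs = 16;

  // Old SVE spill loop: one mov (offset setup) + one st1b per register.
  constexpr int OldInsnsPerGroupOf4 = 4 * (1 + 1); // 8 instructions per 4 registers
  // New SVE spill loop: one st4b + one add per group of four registers.
  constexpr int NewInsnsPerGroupOf4 = 1 + 1;       // 2 instructions per 4 registers

  std::printf("old SVE spill sequence: %d instructions\n", (NumFPRs / 4) * OldInsnsPerGroupOf4);
  std::printf("new SVE spill sequence: %d instructions\n", (NumFPRs / 4) * NewInsnsPerGroupOf4);
  return 0;
}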
@@ -345,18 +345,20 @@ void Arm64Emitter::PushDynamicRegsAndLR(aarch64::Register TmpReg) {
   const uint64_t SPOffset = AlignUp(GPRSize + FPRSize, 16);
 
   sub(sp, sp, SPOffset);
-  int i = 0;
+
+  // rsp capable move
+  add(TmpReg, aarch64::sp, 0);
 
   if (CanUseSVE) {
-    for (const auto& RA : RAFPR) {
-      mov(TMP4, i * 8);
-      st1b(RA.Z().VnB(), PRED_TMP_32B, SVEMemOperand(sp, TMP4));
-      i += 4;
+    for (size_t i = 0; i < RAFPR.size(); i += 4) {
+      const auto Reg1 = RAFPR[i];
+      const auto Reg2 = RAFPR[i + 1];
+      const auto Reg3 = RAFPR[i + 2];
+      const auto Reg4 = RAFPR[i + 3];
+      st4b(Reg1.Z().VnB(), Reg2.Z().VnB(), Reg3.Z().VnB(), Reg4.Z().VnB(), PRED_TMP_32B, SVEMemOperand(TmpReg));
+      add(TmpReg, TmpReg, 32 * 4);
     }
-    str(lr, MemOperand(sp, i * 8));
   } else {
-    // rsp capable move
-    add(TmpReg, aarch64::sp, 0);
     for (size_t i = 0; i < RAFPR.size(); i += 4) {
       const auto Reg1 = RAFPR[i];
       const auto Reg2 = RAFPR[i + 1];
@@ -364,29 +366,24 @@ void Arm64Emitter::PushDynamicRegsAndLR(aarch64::Register TmpReg) {
       const auto Reg4 = RAFPR[i + 3];
       st1(Reg1.V2D(), Reg2.V2D(), Reg3.V2D(), Reg4.V2D(), MemOperand(TmpReg, 64, PostIndex));
     }
-    str(aarch64::lr, MemOperand(TmpReg, 0));
   }
+
+  str(aarch64::lr, MemOperand(TmpReg, 0));
 }
 
 void Arm64Emitter::PopDynamicRegsAndLR() {
   const auto CanUseSVE = EmitterCTX->HostFeatures.SupportsAVX;
-  const auto GPRSize = 1 * Core::CPUState::GPR_REG_SIZE;
-  const auto FPRRegSize = CanUseSVE ? Core::CPUState::XMM_AVX_REG_SIZE
-                                    : Core::CPUState::XMM_SSE_REG_SIZE;
-  const auto FPRSize = RAFPR.size() * FPRRegSize;
-  const uint64_t SPOffset = AlignUp(GPRSize + FPRSize, 16);
-  int i = 0;
 
   if (CanUseSVE) {
-    for (const auto& RA : RAFPR) {
-      mov(TMP4, i * 8);
-      ld1b(RA.Z().VnB(), PRED_TMP_32B.Zeroing(), SVEMemOperand(sp, TMP4));
-      i += 4;
+    for (size_t i = 0; i < RAFPR.size(); i += 4) {
+      const auto Reg1 = RAFPR[i];
+      const auto Reg2 = RAFPR[i + 1];
+      const auto Reg3 = RAFPR[i + 2];
+      const auto Reg4 = RAFPR[i + 3];
+      ld4b(Reg1.Z().VnB(), Reg2.Z().VnB(), Reg3.Z().VnB(), Reg4.Z().VnB(), PRED_TMP_32B.Zeroing(), SVEMemOperand(aarch64::sp));
+      add(aarch64::sp, aarch64::sp, 32 * 4);
     }
-    ldr(lr, MemOperand(sp, i * 8));
-    add(sp, sp, SPOffset);
   } else {
-
     for (size_t i = 0; i < RAFPR.size(); i += 4) {
       const auto Reg1 = RAFPR[i];
       const auto Reg2 = RAFPR[i + 1];
@@ -394,9 +391,9 @@ void Arm64Emitter::PopDynamicRegsAndLR() {
       const auto Reg4 = RAFPR[i + 3];
       ld1(Reg1.V2D(), Reg2.V2D(), Reg3.V2D(), Reg4.V2D(), MemOperand(aarch64::sp, 64, PostIndex));
     }
-
-    ldr(aarch64::lr, MemOperand(aarch64::sp, 16, PostIndex));
   }
+
+  ldr(aarch64::lr, MemOperand(aarch64::sp, 16, PostIndex));
 }
 
 void Arm64Emitter::Align16B() {
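Why the stack spill and fill above can use st4b/ld4b while the context path cannot: the SVE four-register structure store interleaves the source registers element by element in memory, whereas the CPU context expects each register in its own contiguous slot. As long as the data is read back with a matching ld4b, the interleaved stack layout round-trips losslessly, which is exactly the situation in PushDynamicRegsAndLR/PopDynamicRegsAndLR. Below is a minimal standalone model of the two layouts; plain byte arrays stand in for Z registers, a 256-bit vector length is assumed, and none of this is SVE or FEX code.

#include <array>
#include <cstdint>
#include <cstdio>

// Assumed 256-bit (32-byte) Z registers, matching the SVE-256 targets mentioned above.
constexpr std::size_t VL = 32;
using ZReg = std::array<std::uint8_t, VL>;

// st4b-style structure store: bytes of the four registers are interleaved in memory:
// z0[0], z1[0], z2[0], z3[0], z0[1], z1[1], ...
void StoreInterleaved(const ZReg& z0, const ZReg& z1, const ZReg& z2, const ZReg& z3, std::uint8_t* mem) {
  for (std::size_t e = 0; e < VL; ++e) {
    mem[e * 4 + 0] = z0[e];
    mem[e * 4 + 1] = z1[e];
    mem[e * 4 + 2] = z2[e];
    mem[e * 4 + 3] = z3[e];
  }
}

// Context-style store: each register occupies its own contiguous 32-byte slot,
// which is the layout the guest vector state in the CPU context requires.
void StoreContiguous(const ZReg& z0, const ZReg& z1, const ZReg& z2, const ZReg& z3, std::uint8_t* mem) {
  const ZReg* regs[4] = {&z0, &z1, &z2, &z3};
  for (std::size_t r = 0; r < 4; ++r) {
    for (std::size_t e = 0; e < VL; ++e) {
      mem[r * VL + e] = (*regs[r])[e];
    }
  }
}

int main() {
  ZReg z0{}, z1{}, z2{}, z3{};
  z0.fill(0x00); z1.fill(0x11); z2.fill(0x22); z3.fill(0x33);

  std::uint8_t interleaved[4 * VL] = {};
  std::uint8_t contiguous[4 * VL] = {};
  StoreInterleaved(z0, z1, z2, z3, interleaved);
  StoreContiguous(z0, z1, z2, z3, contiguous);

  std::printf("interleaved layout starts: %02x %02x %02x %02x ...\n",
              interleaved[0], interleaved[1], interleaved[2], interleaved[3]);
  std::printf("contiguous layout starts : %02x %02x %02x %02x ...\n",
              contiguous[0], contiguous[1], contiguous[2], contiguous[3]);
  return 0;
}

Running the model shows the interleaved buffer starting 00 11 22 33 (one byte from each register) while the contiguous buffer starts 00 00 00 00; the latter is the layout the context spill/fill path has to preserve, which is why it keeps the single-register form.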
@@ -38,12 +38,7 @@ namespace FEXCore::CPU {
 using namespace vixl;
 using namespace vixl::aarch64;
 
-#ifdef VIXL_SIMULATOR
-// Vixl simulator needs at least 4476 bytes for its dispatcher
-constexpr size_t MAX_DISPATCHER_CODE_SIZE = 4096 * 2;
-#else
 constexpr size_t MAX_DISPATCHER_CODE_SIZE = 4096;
-#endif
 
 Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const DispatcherConfig &config)
   : FEXCore::CPU::Dispatcher(ctx, config), Arm64Emitter(ctx, MAX_DISPATCHER_CODE_SIZE)
@@ -51,7 +46,6 @@ Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const DispatcherConfig &config)
   , Simulator {&Decoder}
 #endif
 {
-
 #ifdef VIXL_SIMULATOR
   // Hardcode a 256-bit vector width if we are running in the simulator.
   Simulator.SetVectorLengthInBits(256);
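The two dispatcher hunks above follow from the size numbers in the commit message: the SVE dispatcher previously needed 4476 bytes, which is why the VIXL simulator build (which hardcodes a 256-bit vector length) doubled MAX_DISPATCHER_CODE_SIZE, while the new dispatcher fits in 3900 bytes, so a single 4096-byte buffer is enough everywhere. A tiny standalone sketch of that reasoning, with the numbers taken from the commit message and the removed comment (illustrative only, not part of the change itself):

#include <cstddef>

constexpr std::size_t MAX_DISPATCHER_CODE_SIZE = 4096;
constexpr std::size_t OldSVEDispatcherBytes = 4476; // previously forced the 4096 * 2 simulator buffer
constexpr std::size_t NewSVEDispatcherBytes = 3900; // now fits in the default buffer

static_assert(OldSVEDispatcherBytes > MAX_DISPATCHER_CODE_SIZE,
              "old SVE dispatcher overflowed the default 4096-byte buffer");
static_assert(NewSVEDispatcherBytes <= MAX_DISPATCHER_CODE_SIZE,
              "new SVE dispatcher fits, so the VIXL_SIMULATOR-only size override can go away");

int main() { return 0; }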