Mirror of https://github.com/FEX-Emu/FEX.git (synced 2025-02-21 15:22:26 +00:00)
Arm64: Optimize SVE register spilling and filling
Causes the dispatcher to drop from 4476 bytes down to 3900 bytes for SVE-256-bit supporting targets. This is done by significantly reducing the number of SVE load/store instructions: from 8 instructions per 4 registers down to 2, by switching from single-register load/stores to four-register load/stores. This should also significantly improve performance on future SVE platforms. Filling and spilling to the context still uses the old code path because SVE doesn't offer non-interleaving multi-register load/stores. Spilling and filling on the stack is fine because there we don't need to match the context's memory layout.
This commit is contained in:
parent 58fab721b3
commit 40e073c8b2
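As a back-of-the-envelope check on the instruction-count claim above, here is a small standalone C++ sketch of the arithmetic. The per-group counts are read off the diff below (old SVE path: one mov plus one st1b per register; new SVE path: one st4b plus one add per group of four registers); the register count of 16 is an assumption for illustration only, not something stated by this commit.

#include <cstdio>

int main() {
  // Assumption for illustration only: 16 dynamically register-allocated FPRs (RAFPR.size()).
  constexpr int NumFPRs = 16;

  // Old SVE spill loop: one mov (offset setup) + one st1b per register.
  constexpr int OldInsnsPerGroupOf4 = 4 * (1 + 1); // 8 instructions per 4 registers
  // New SVE spill loop: one st4b + one add per group of four registers.
  constexpr int NewInsnsPerGroupOf4 = 1 + 1;       // 2 instructions per 4 registers

  std::printf("old SVE spill sequence: %d instructions\n", (NumFPRs / 4) * OldInsnsPerGroupOf4);
  std::printf("new SVE spill sequence: %d instructions\n", (NumFPRs / 4) * NewInsnsPerGroupOf4);
  return 0;
}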
@@ -345,18 +345,20 @@ void Arm64Emitter::PushDynamicRegsAndLR(aarch64::Register TmpReg) {
   const uint64_t SPOffset = AlignUp(GPRSize + FPRSize, 16);
 
   sub(sp, sp, SPOffset);
-  int i = 0;
+
+  // rsp capable move
+  add(TmpReg, aarch64::sp, 0);
 
   if (CanUseSVE) {
-    for (const auto& RA : RAFPR) {
-      mov(TMP4, i * 8);
-      st1b(RA.Z().VnB(), PRED_TMP_32B, SVEMemOperand(sp, TMP4));
-      i += 4;
+    for (size_t i = 0; i < RAFPR.size(); i += 4) {
+      const auto Reg1 = RAFPR[i];
+      const auto Reg2 = RAFPR[i + 1];
+      const auto Reg3 = RAFPR[i + 2];
+      const auto Reg4 = RAFPR[i + 3];
+      st4b(Reg1.Z().VnB(), Reg2.Z().VnB(), Reg3.Z().VnB(), Reg4.Z().VnB(), PRED_TMP_32B, SVEMemOperand(TmpReg));
+      add(TmpReg, TmpReg, 32 * 4);
     }
-    str(lr, MemOperand(sp, i * 8));
   } else {
-    // rsp capable move
-    add(TmpReg, aarch64::sp, 0);
     for (size_t i = 0; i < RAFPR.size(); i += 4) {
       const auto Reg1 = RAFPR[i];
       const auto Reg2 = RAFPR[i + 1];
@@ -364,29 +366,24 @@ void Arm64Emitter::PushDynamicRegsAndLR(aarch64::Register TmpReg) {
       const auto Reg4 = RAFPR[i + 3];
       st1(Reg1.V2D(), Reg2.V2D(), Reg3.V2D(), Reg4.V2D(), MemOperand(TmpReg, 64, PostIndex));
     }
-    str(aarch64::lr, MemOperand(TmpReg, 0));
   }
+
+  str(aarch64::lr, MemOperand(TmpReg, 0));
 }
 
 void Arm64Emitter::PopDynamicRegsAndLR() {
   const auto CanUseSVE = EmitterCTX->HostFeatures.SupportsAVX;
-  const auto GPRSize = 1 * Core::CPUState::GPR_REG_SIZE;
-  const auto FPRRegSize = CanUseSVE ? Core::CPUState::XMM_AVX_REG_SIZE
-                                    : Core::CPUState::XMM_SSE_REG_SIZE;
-  const auto FPRSize = RAFPR.size() * FPRRegSize;
-  const uint64_t SPOffset = AlignUp(GPRSize + FPRSize, 16);
-  int i = 0;
 
   if (CanUseSVE) {
-    for (const auto& RA : RAFPR) {
-      mov(TMP4, i * 8);
-      ld1b(RA.Z().VnB(), PRED_TMP_32B.Zeroing(), SVEMemOperand(sp, TMP4));
-      i += 4;
+    for (size_t i = 0; i < RAFPR.size(); i += 4) {
+      const auto Reg1 = RAFPR[i];
+      const auto Reg2 = RAFPR[i + 1];
+      const auto Reg3 = RAFPR[i + 2];
+      const auto Reg4 = RAFPR[i + 3];
+      ld4b(Reg1.Z().VnB(), Reg2.Z().VnB(), Reg3.Z().VnB(), Reg4.Z().VnB(), PRED_TMP_32B.Zeroing(), SVEMemOperand(aarch64::sp));
+      add(aarch64::sp, aarch64::sp, 32 * 4);
     }
-    ldr(lr, MemOperand(sp, i * 8));
-    add(sp, sp, SPOffset);
   } else {
-
     for (size_t i = 0; i < RAFPR.size(); i += 4) {
       const auto Reg1 = RAFPR[i];
       const auto Reg2 = RAFPR[i + 1];
@@ -394,9 +391,9 @@ void Arm64Emitter::PopDynamicRegsAndLR() {
       const auto Reg4 = RAFPR[i + 3];
       ld1(Reg1.V2D(), Reg2.V2D(), Reg3.V2D(), Reg4.V2D(), MemOperand(aarch64::sp, 64, PostIndex));
     }
-
-    ldr(aarch64::lr, MemOperand(aarch64::sp, 16, PostIndex));
   }
+
+  ldr(aarch64::lr, MemOperand(aarch64::sp, 16, PostIndex));
 }
 
 void Arm64Emitter::Align16B() {
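Why the stack spill and fill above can use st4b/ld4b while the context path cannot: the SVE four-register structure store interleaves the source registers element by element in memory, whereas the CPU context expects each register in its own contiguous slot. As long as the data is read back with a matching ld4b, the interleaved stack layout round-trips losslessly, which is exactly the situation in PushDynamicRegsAndLR/PopDynamicRegsAndLR. Below is a minimal standalone model of the two layouts; plain byte arrays stand in for Z registers, a 256-bit vector length is assumed, and none of this is SVE or FEX code.

#include <array>
#include <cstdint>
#include <cstdio>

// Assumed 256-bit (32-byte) Z registers, matching the SVE-256 targets mentioned above.
constexpr std::size_t VL = 32;
using ZReg = std::array<std::uint8_t, VL>;

// st4b-style structure store: bytes of the four registers are interleaved in memory:
// z0[0], z1[0], z2[0], z3[0], z0[1], z1[1], ...
void StoreInterleaved(const ZReg& z0, const ZReg& z1, const ZReg& z2, const ZReg& z3, std::uint8_t* mem) {
  for (std::size_t e = 0; e < VL; ++e) {
    mem[e * 4 + 0] = z0[e];
    mem[e * 4 + 1] = z1[e];
    mem[e * 4 + 2] = z2[e];
    mem[e * 4 + 3] = z3[e];
  }
}

// Context-style store: each register occupies its own contiguous 32-byte slot,
// which is the layout the guest vector state in the CPU context requires.
void StoreContiguous(const ZReg& z0, const ZReg& z1, const ZReg& z2, const ZReg& z3, std::uint8_t* mem) {
  const ZReg* regs[4] = {&z0, &z1, &z2, &z3};
  for (std::size_t r = 0; r < 4; ++r) {
    for (std::size_t e = 0; e < VL; ++e) {
      mem[r * VL + e] = (*regs[r])[e];
    }
  }
}

int main() {
  ZReg z0{}, z1{}, z2{}, z3{};
  z0.fill(0x00); z1.fill(0x11); z2.fill(0x22); z3.fill(0x33);

  std::uint8_t interleaved[4 * VL] = {};
  std::uint8_t contiguous[4 * VL] = {};
  StoreInterleaved(z0, z1, z2, z3, interleaved);
  StoreContiguous(z0, z1, z2, z3, contiguous);

  std::printf("interleaved layout starts: %02x %02x %02x %02x ...\n",
              interleaved[0], interleaved[1], interleaved[2], interleaved[3]);
  std::printf("contiguous layout starts : %02x %02x %02x %02x ...\n",
              contiguous[0], contiguous[1], contiguous[2], contiguous[3]);
  return 0;
}

Running the model shows the interleaved buffer starting 00 11 22 33 (one byte from each register) while the contiguous buffer starts 00 00 00 00; the latter is the layout the context spill/fill path has to preserve, which is why it keeps the single-register form.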
@@ -38,12 +38,7 @@ namespace FEXCore::CPU {
 using namespace vixl;
 using namespace vixl::aarch64;
 
-#ifdef VIXL_SIMULATOR
-// Vixl simulator needs at least 4476 bytes for its dispatcher
-constexpr size_t MAX_DISPATCHER_CODE_SIZE = 4096 * 2;
-#else
 constexpr size_t MAX_DISPATCHER_CODE_SIZE = 4096;
-#endif
 
 Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const DispatcherConfig &config)
   : FEXCore::CPU::Dispatcher(ctx, config), Arm64Emitter(ctx, MAX_DISPATCHER_CODE_SIZE)
@@ -51,7 +46,6 @@ Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const DispatcherConfig &config)
   , Simulator {&Decoder}
 #endif
 {
-
 #ifdef VIXL_SIMULATOR
   // Hardcode a 256-bit vector width if we are running in the simulator.
   Simulator.SetVectorLengthInBits(256);
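The two dispatcher hunks above follow from the size numbers in the commit message: the SVE dispatcher previously needed 4476 bytes, which is why the VIXL simulator build (which hardcodes a 256-bit vector length) doubled MAX_DISPATCHER_CODE_SIZE, while the new dispatcher fits in 3900 bytes, so a single 4096-byte buffer is enough everywhere. A tiny standalone sketch of that reasoning, with the numbers taken from the commit message and the removed comment (illustrative only, not part of the change itself):

#include <cstddef>

constexpr std::size_t MAX_DISPATCHER_CODE_SIZE = 4096;
constexpr std::size_t OldSVEDispatcherBytes = 4476; // previously forced the 4096 * 2 simulator buffer
constexpr std::size_t NewSVEDispatcherBytes = 3900; // now fits in the default buffer

static_assert(OldSVEDispatcherBytes > MAX_DISPATCHER_CODE_SIZE,
              "old SVE dispatcher overflowed the default 4096-byte buffer");
static_assert(NewSVEDispatcherBytes <= MAX_DISPATCHER_CODE_SIZE,
              "new SVE dispatcher fits, so the VIXL_SIMULATOR-only size override can go away");

int main() { return 0; }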