Arm64: Stop abusing orr in LoadConstant

The current implementation uses orr excessively. This causes FEX to miss
hardware optimization opportunities where some CPU cores will zero-cycle
move constants that fit into the 16-bit immediates of movz/movk.
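
As a rough sketch of the idea (illustrative code only, not the FEX emitter; the helper name and the example constant are made up), a value with at most one non-zero 16-bit segment can always be loaded with a single movz, whereas the old ordering could pick an orr against the zero register for it first:

#include <cstdint>

// Sketch only: true when the constant occupies at most one 16-bit segment
// and can therefore be loaded with a single movz.
//
// With the made-up constant 0xFF:
//   old path:  orr  x0, xzr, #0xff   ; 0xff is a valid bitmask immediate, so orr won
//   new path:  movz x0, #0xff        ; zero-cycle rename candidate on some cores
static bool FitsInSingleMovz(uint64_t Constant) {
  int NonZeroSegments = 0;
  for (int i = 0; i < 4; ++i) {
    if ((Constant >> (i * 16)) & 0xFFFF) {
      ++NonZeroSegments;
    }
  }
  return NonZeroSegments <= 1;
}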

First, evaluate up front whether the number of 16-bit segments is > 1; only
in those cases should we check whether the constant is a bitmask immediate
that can be loaded in one instruction with orr.
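
A minimal sketch of that ordering, assuming hypothetical stand-ins (IsLogicalImm, EmitOrrWithZeroReg, and EmitMovzMovkSequence are placeholder names; the real code uses vixl's IsImmLogical and the emitter calls shown in the diff below):

#include <cstdint>

bool IsLogicalImm(uint64_t Constant);          // placeholder for vixl::aarch64::Assembler::IsImmLogical
void EmitOrrWithZeroReg(uint64_t Constant);    // placeholder: orr <reg>, zr, #imm
void EmitMovzMovkSequence(uint64_t Constant);  // placeholder: movz followed by movk

void LoadConstantSketch(uint64_t Constant, int RequiredMoveSegments) {
  // Only reach for the bitmask-immediate orr when more than one 16-bit
  // segment is populated; single-segment values take the movz path.
  if (RequiredMoveSegments > 1 && IsLogicalImm(Constant)) {
    // e.g. 0x0000'FFFF'FFFF'0000 would need two movz/movk instructions but
    // is a valid bitmask immediate, so a single orr is shorter.
    EmitOrrWithZeroReg(Constant);
    return;
  }
  EmitMovzMovkSequence(Constant);
}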

After that point we will use movz for 16-bit constant moves.

Additionally, this optimizes the case where a constant of zero is loaded:
it becomes a single `mov <reg>, zr`, which gets renamed in most hardware.
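
A standalone sketch of the resulting movz/movk emission, including that zero special case (illustrative only; the register name and output formatting are made up, and the real emitter calls are the movz/movk helpers visible in the diff below):

#include <cstdint>
#include <cstdio>

// Print the instruction sequence the new logic would produce for a 64-bit
// constant: movz for the first non-zero 16-bit segment, movk for the rest,
// and a single clearing move when every segment is zero.
static void PrintMoveSequence(uint64_t Constant) {
  int NumMoves = 0;
  for (int i = 0; i < 4; ++i) {
    const unsigned Part = (Constant >> (i * 16)) & 0xFFFF;
    if (!Part) {
      continue;
    }
    std::printf("%s x0, #0x%x, lsl #%d\n", NumMoves == 0 ? "movz" : "movk", Part, i * 16);
    ++NumMoves;
  }
  if (NumMoves == 0) {
    // The constant was zero; still emit exactly one instruction to clear the register.
    std::printf("movz x0, #0\n");
  }
}

int main() {
  PrintMoveSequence(0);                      // movz x0, #0
  PrintMoveSequence(0x0001'0000'0000'4242);  // movz ... #0x4242, then movk ... #0x1, lsl #48
  return 0;
}
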
Ryan Houdek 2023-08-16 16:49:46 -07:00
parent 6cb0f52e94
commit 23fd79a3b3

@@ -213,17 +213,6 @@ void Arm64Emitter::LoadConstant(ARMEmitter::Size s, ARMEmitter::Register Reg, ui
Segments = 2;
}
// If this can be loaded with a mov bitmask.
const auto IsImm = vixl::aarch64::Assembler::IsImmLogical(Constant, RegSizeInBits(s));
if (IsImm) {
orr(s, Reg, ARMEmitter::Reg::zr, Constant);
if (NOPPad) {
nop(); nop(); nop();
}
return;
}
int NumMoves = 1;
int RequiredMoveSegments{};
// Count the number of move segments
@@ -235,6 +224,20 @@ void Arm64Emitter::LoadConstant(ARMEmitter::Size s, ARMEmitter::Register Reg, ui
}
}
// If this can be loaded with a mov bitmask.
if (RequiredMoveSegments > 1) {
// Only try to use this path if the number of segments is > 1.
// `movz` is better than `orr` since hardware will rename or merge if possible when `movz` is used.
const auto IsImm = vixl::aarch64::Assembler::IsImmLogical(Constant, RegSizeInBits(s));
if (IsImm) {
orr(s, Reg, ARMEmitter::Reg::zr, Constant);
if (NOPPad) {
nop(); nop(); nop();
}
return;
}
}
// ADRP+ADD is specifically optimized in hardware
// Check if we can use this
auto PC = GetCursorAddress<uint64_t>();
@@ -245,6 +248,8 @@ void Arm64Emitter::LoadConstant(ARMEmitter::Size s, ARMEmitter::Register Reg, ui
// Offset from aligned PC
int64_t AlignedOffset = static_cast<int64_t>(Constant) - static_cast<int64_t>(AlignedPC);
int NumMoves = 0;
// If the aligned offset is within the 4GB window and the number of move
// segments is more than 1, then we can use ADRP+ADD
if (RequiredMoveSegments > 1 && vixl::IsInt32(AlignedOffset)) {
@@ -268,14 +273,30 @@ void Arm64Emitter::LoadConstant(ARMEmitter::Size s, ARMEmitter::Register Reg, ui
}
}
else {
movz(s, Reg, (Constant) & 0xFFFF, 0);
for (int i = 1; i < Segments; ++i) {
uint16_t Part = (Constant >> (i * 16)) & 0xFFFF;
int CurrentSegment = 0;
for (; CurrentSegment < Segments; ++CurrentSegment) {
uint16_t Part = (Constant >> (CurrentSegment * 16)) & 0xFFFF;
if (Part) {
movk(s, Reg, Part, i * 16);
movz(s, Reg, Part, CurrentSegment * 16);
++CurrentSegment;
++NumMoves;
break;
}
}
for (; CurrentSegment < Segments; ++CurrentSegment) {
uint16_t Part = (Constant >> (CurrentSegment * 16)) & 0xFFFF;
if (Part) {
movk(s, Reg, Part, CurrentSegment * 16);
++NumMoves;
}
}
if (NumMoves == 0) {
// If we didn't move anything that means this is a zero move. Special case this.
movz(s, Reg, 0);
++NumMoves;
}
}
if (NOPPad) {