Merge pull request #3380 from alyssarosenzweig/opt/pdep

Optimize PDEP
2025-03-05 04:57:12 +00:00 · 2024-01-29 13:27:15 -05:00 · 2024-01-29 13:27:15 -05:00 · 58f3d3caf5
commit 58f3d3caf5
parent 8e3d4a3e02 409b6ff6ef
4 changed files with 102 additions and 163 deletions
--- a/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp
+++ b/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp
@ -713,64 +713,58 @@ DEF_OP(PDep) {
  LOGMAN_THROW_AA_FMT(OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize);
  const auto EmitSize = OpSize == 8 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;

-  const auto Input = GetReg(Op->Input.ID());
-  const auto Mask = GetReg(Op->Mask.ID());
  const auto Dest = GetReg(Node);

-  const auto ShiftedBitReg = TMP1.R();
-  const auto BitReg        = TMP2.R();
-  const auto SubMaskReg    = TMP3.R();
-  const auto IndexReg      = TMP4.R();
-  const auto ZeroReg       = ARMEmitter::Reg::zr;
+  // PDep implementation follows the ideas from
+  // http://0x80.pl/articles/pdep-soft-emu.html ... Basically, iterate the *set*
+  // bits only, which will be faster than the naive implementation as long as
+  // there are enough holes in the mask.
+  //
+  // The specific arm64 assembly used is based on the sequence that clang
+  // generates for the C code, giving context to the scheduling yielding better
+  // ILP than I would do by hand. The registers are allocated by hand however,
+  // to fit within the tight constraints we have here without spilling. Also, we
+  // use cbz/cbnz for conditional branching to avoid clobbering NZCV.

-  const auto InputReg = StaticRegisters[0];
-  const auto MaskReg  = StaticRegisters[1];
-  const auto DestReg  = StaticRegisters[2];
+  // We can't clobber these
+  const auto OrigInput = GetReg(Op->Input.ID());
+  const auto OrigMask = GetReg(Op->Mask.ID());

-  const auto SpillCode = 1U << InputReg.Idx() |
-                         1U << MaskReg.Idx() |
-                         1U << DestReg.Idx();
+  // So we shadow them in temporaries
+  const auto Input = TMP1.R();
+  const auto Mask  = TMP2.R();
+
+  // these get used variously as scratch
+  const auto T0    = TMP3.R();
+  const auto T1    = TMP4.R();

-  ARMEmitter::SingleUseForwardLabel EarlyExit;
  ARMEmitter::BackwardLabel NextBit;
  ARMEmitter::SingleUseForwardLabel Done;
-  cbz(EmitSize, Mask, &EarlyExit);
-  mov(EmitSize, IndexReg, ZeroReg);

-  // We sadly need to spill regs for this for the time being
-  // TODO: Remove when scratch registers can be allocated
-  //       explicitly.
-  SpillStaticRegs(TMP1, false, SpillCode);
+  // First, copy the input/mask, since we'll be clobbering. Copy as 64-bit to
+  // make this 0-uop on Firestorm.
+  mov(ARMEmitter::Size::i64Bit, Input, OrigInput);
+  mov(ARMEmitter::Size::i64Bit, Mask, OrigMask);

+  // Now, they're copied, so we can start setting Dest (even if it overlaps with
+  // one of them).  Handle early exit case
+  mov(EmitSize, Dest, 0);
+  cbz(EmitSize, OrigMask, &Done);

-  mov(EmitSize, InputReg, Input);
-  mov(EmitSize, MaskReg, Mask);
-  mov(EmitSize, DestReg, ZeroReg);
+  // Setup for first iteration
+  neg(EmitSize, T0, Mask);
+  and_(EmitSize, T0, T0, Mask);

  // Main loop
  Bind(&NextBit);
-  rbit(EmitSize, ShiftedBitReg, MaskReg);
-  clz(EmitSize, ShiftedBitReg, ShiftedBitReg);
-  lsrv(EmitSize, BitReg, InputReg, IndexReg);
-  and_(EmitSize, BitReg, BitReg, 1);
-  sub(EmitSize, SubMaskReg, MaskReg, 1);
-  add(EmitSize, IndexReg, IndexReg, 1);
-  ands(EmitSize, MaskReg, MaskReg, SubMaskReg);
-  lslv(EmitSize, ShiftedBitReg, BitReg, ShiftedBitReg);
-  orr(EmitSize, DestReg, DestReg, ShiftedBitReg);
-  b(ARMEmitter::Condition::CC_NE, &NextBit);
-  // Store result in a temp so it doesn't get clobbered.
-  // and restore it after the re-fill below.
-  mov(EmitSize, IndexReg, DestReg);
-  // Restore our registers before leaving
-  // TODO: Also remove along with above TODO.
-  FillStaticRegs(false, SpillCode);
-  mov(EmitSize, Dest, IndexReg);
-  b(&Done);
-
-  // Early exit
-  Bind(&EarlyExit);
-  mov(EmitSize, Dest, ZeroReg);
+  sbfx(EmitSize, T1, Input, 0, 1);
+  eor(EmitSize, Mask, Mask, T0);
+  and_(EmitSize, T0, T1, T0);
+  neg(EmitSize, T1, Mask);
+  orr(EmitSize, Dest, Dest, T0);
+  lsr(EmitSize, Input, Input, 1);
+  and_(EmitSize, T0, Mask, T1);
+  cbnz(EmitSize, T0, &NextBit);

  // All done with nothing to do.
  Bind(&Done);
--- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
+++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
@ -2307,8 +2307,9 @@ void OpDispatchBuilder::MULX(OpcodeArgs) {
 }

 void OpDispatchBuilder::PDEP(OpcodeArgs) {
-  auto* Input = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags);
-  auto* Mask = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags);
+  LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed");
+  auto* Input = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
+  auto* Mask = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});
  auto Result = _PDep(OpSizeFromSrc(Op), Input, Mask);

  StoreResult(GPRClass, Op, Op->Dest, Result, -1);
--- a/unittests/InstructionCountCI/FlagM/VEX_map2.json
+++ b/unittests/InstructionCountCI/FlagM/VEX_map2.json
@ -472,75 +472,47 @@
      ]
    },
    "pdep eax, ebx, ecx": {
-      "ExpectedInstructionCount": 29,
+      "ExpectedInstructionCount": 14,
      "Comment": [
        "Map 2 0b11 0xf5 32-bit"
      ],
      "ExpectedArm64ASM": [
-        "mov w20, w7",
-        "mov w21, w5",
-        "cbz w21, #+0x68",
-        "mov w3, wzr",
-        "mrs x0, nzcv",
-        "str w0, [x28, #728]",
-        "stp x4, x5, [x28, #8]",
-        "str x6, [x28, #24]",
-        "mov w4, w20",
-        "mov w5, w21",
-        "mov w6, wzr",
-        "rbit w0, w5",
-        "clz w0, w0",
-        "lsr w1, w4, w3",
-        "and w1, w1, #0x1",
-        "sub w2, w5, #0x1 (1)",
-        "add w3, w3, #0x1 (1)",
-        "ands w5, w5, w2",
-        "lsl w0, w1, w0",
-        "orr w6, w6, w0",
-        "b.ne #-0x24",
-        "mov w3, w6",
-        "ldr w4, [x28, #728]",
-        "msr nzcv, x4",
-        "ldp x4, x5, [x28, #8]",
-        "ldr x6, [x28, #24]",
-        "mov w4, w3",
-        "b #+0x8",
-        "mov w4, wzr"
+        "mov x0, x7",
+        "mov x1, x5",
+        "mov w4, #0x0",
+        "cbz w5, #+0x2c",
+        "neg w2, w1",
+        "and w2, w2, w1",
+        "sbfx w3, w0, #0, #1",
+        "eor w1, w1, w2",
+        "and w2, w3, w2",
+        "neg w3, w1",
+        "orr w4, w4, w2",
+        "lsr w0, w0, #1",
+        "and w2, w1, w3",
+        "cbnz w2, #-0x1c"
      ]
    },
    "pdep rax, rbx, rcx": {
-      "ExpectedInstructionCount": 27,
+      "ExpectedInstructionCount": 14,
      "Comment": [
        "Map 2 0b11 0xf5 64-bit"
      ],
      "ExpectedArm64ASM": [
-        "cbz x5, #+0x68",
-        "mov x3, xzr",
-        "mrs x0, nzcv",
-        "str w0, [x28, #728]",
-        "stp x4, x5, [x28, #8]",
-        "str x6, [x28, #24]",
-        "mov x4, x7",
-        "mov x5, x5",
-        "mov x6, xzr",
-        "rbit x0, x5",
-        "clz x0, x0",
-        "lsr x1, x4, x3",
-        "and x1, x1, #0x1",
-        "sub x2, x5, #0x1 (1)",
-        "add x3, x3, #0x1 (1)",
-        "ands x5, x5, x2",
-        "lsl x0, x1, x0",
-        "orr x6, x6, x0",
-        "b.ne #-0x24",
-        "mov x3, x6",
-        "ldr w4, [x28, #728]",
-        "msr nzcv, x4",
-        "ldp x4, x5, [x28, #8]",
-        "ldr x6, [x28, #24]",
-        "mov x4, x3",
-        "b #+0x8",
-        "mov x4, xzr"
+        "mov x0, x7",
+        "mov x1, x5",
+        "mov x4, #0x0",
+        "cbz x5, #+0x2c",
+        "neg x2, x1",
+        "and x2, x2, x1",
+        "sbfx x3, x0, #0, #1",
+        "eor x1, x1, x2",
+        "and x2, x3, x2",
+        "neg x3, x1",
+        "orr x4, x4, x2",
+        "lsr x0, x0, #1",
+        "and x2, x1, x3",
+        "cbnz x2, #-0x1c"
      ]
    },
    "bextr eax, ebx, ecx": {
--- a/unittests/InstructionCountCI/VEX_map2.json
+++ b/unittests/InstructionCountCI/VEX_map2.json
@ -3329,75 +3329,47 @@
      ]
    },
    "pdep eax, ebx, ecx": {
-      "ExpectedInstructionCount": 29,
+      "ExpectedInstructionCount": 14,
      "Comment": [
        "Map 2 0b11 0xf5 32-bit"
      ],
      "ExpectedArm64ASM": [
-        "mov w20, w7",
-        "mov w21, w5",
-        "cbz w21, #+0x68",
-        "mov w3, wzr",
-        "mrs x0, nzcv",
-        "str w0, [x28, #728]",
-        "stp x4, x5, [x28, #8]",
-        "str x6, [x28, #24]",
-        "mov w4, w20",
-        "mov w5, w21",
-        "mov w6, wzr",
-        "rbit w0, w5",
-        "clz w0, w0",
-        "lsr w1, w4, w3",
-        "and w1, w1, #0x1",
-        "sub w2, w5, #0x1 (1)",
-        "add w3, w3, #0x1 (1)",
-        "ands w5, w5, w2",
-        "lsl w0, w1, w0",
-        "orr w6, w6, w0",
-        "b.ne #-0x24",
-        "mov w3, w6",
-        "ldr w4, [x28, #728]",
-        "msr nzcv, x4",
-        "ldp x4, x5, [x28, #8]",
-        "ldr x6, [x28, #24]",
-        "mov w4, w3",
-        "b #+0x8",
-        "mov w4, wzr"
+        "mov x0, x7",
+        "mov x1, x5",
+        "mov w4, #0x0",
+        "cbz w5, #+0x2c",
+        "neg w2, w1",
+        "and w2, w2, w1",
+        "sbfx w3, w0, #0, #1",
+        "eor w1, w1, w2",
+        "and w2, w3, w2",
+        "neg w3, w1",
+        "orr w4, w4, w2",
+        "lsr w0, w0, #1",
+        "and w2, w1, w3",
+        "cbnz w2, #-0x1c"
      ]
    },
    "pdep rax, rbx, rcx": {
-      "ExpectedInstructionCount": 27,
+      "ExpectedInstructionCount": 14,
      "Comment": [
        "Map 2 0b11 0xf5 64-bit"
      ],
      "ExpectedArm64ASM": [
-        "cbz x5, #+0x68",
-        "mov x3, xzr",
-        "mrs x0, nzcv",
-        "str w0, [x28, #728]",
-        "stp x4, x5, [x28, #8]",
-        "str x6, [x28, #24]",
-        "mov x4, x7",
-        "mov x5, x5",
-        "mov x6, xzr",
-        "rbit x0, x5",
-        "clz x0, x0",
-        "lsr x1, x4, x3",
-        "and x1, x1, #0x1",
-        "sub x2, x5, #0x1 (1)",
-        "add x3, x3, #0x1 (1)",
-        "ands x5, x5, x2",
-        "lsl x0, x1, x0",
-        "orr x6, x6, x0",
-        "b.ne #-0x24",
-        "mov x3, x6",
-        "ldr w4, [x28, #728]",
-        "msr nzcv, x4",
-        "ldp x4, x5, [x28, #8]",
-        "ldr x6, [x28, #24]",
-        "mov x4, x3",
-        "b #+0x8",
-        "mov x4, xzr"
+        "mov x0, x7",
+        "mov x1, x5",
+        "mov x4, #0x0",
+        "cbz x5, #+0x2c",
+        "neg x2, x1",
+        "and x2, x2, x1",
+        "sbfx x3, x0, #0, #1",
+        "eor x1, x1, x2",
+        "and x2, x3, x2",
+        "neg x3, x1",
+        "orr x4, x4, x2",
+        "lsr x0, x0, #1",
+        "and x2, x1, x3",
+        "cbnz x2, #-0x1c"
      ]
    },
    "mulx eax, ebx, ecx": {