OpcodeDispatcher: Remove unnecessary moves in {AVX}VectorUnaryOp

When a scalar operation's source is already in a vector register, we can load it
at the full vector length rather than loading a smaller size and zero-extending
the register, since the resulting value is just inserted into another vector
anyway. Memory operands still load the exact element size.
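
Concretely, taken from the instruction-count tests updated below: sqrtss xmm0, xmm1
previously zero-extended the source element into a temporary before operating on it:

    movi v0.2d, #0x0
    mov v0.s[0], v17.s[0]
    mov v4.16b, v0.16b
    fsqrt s4, s4
    mov v16.s[0], v4.s[0]

With the source loaded at the full vector length, the scalar operation reads the
source register directly and the three moves disappear:

    fsqrt s4, s17
    mov v16.s[0], v4.s[0]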
Lioncache 2023-08-19 19:41:31 -04:00
parent db60a2fd4b
commit 09addb217a
4 changed files with 28 additions and 38 deletions


@@ -651,13 +651,18 @@ template
 void OpDispatchBuilder::AVXVectorScalarALUOp<IR::OP_VFSUB, 8>(OpcodeArgs);
 
 void OpDispatchBuilder::VectorUnaryOpImpl(OpcodeArgs, IROps IROp, size_t ElementSize, bool Scalar) {
-  const auto Size = Scalar ? ElementSize : GetSrcSize(Op);
+  // In the event of a scalar operation and a vector source, then
+  // we can specify the entire vector length in order to avoid
+  // unnecessary sign extension on the element to be operated on.
+  // In the event of a memory operand, we load the exact element size.
+  const auto SrcSize = Scalar && Op->Src[0].IsGPR() ? 16U : GetSrcSize(Op);
+  const auto OpSize = Scalar ? ElementSize : GetSrcSize(Op);
   const auto DstSize = GetDstSize(Op);
 
-  OrderedNode *Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags, -1);
+  OrderedNode *Src = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], SrcSize, Op->Flags, -1);
   OrderedNode *Dest = LoadSource_WithOpSize(FPRClass, Op, Op->Dest, DstSize, Op->Flags, -1);
 
-  auto ALUOp = _VFSqrt(Size, ElementSize, Src);
+  auto ALUOp = _VFSqrt(OpSize, ElementSize, Src);
 
   // Overwrite our IR's op type
   ALUOp.first->Header.Op = IROp;
@@ -702,19 +707,24 @@ template
 void OpDispatchBuilder::VectorUnaryOp<IR::OP_VABS, 4, false>(OpcodeArgs);
 
 void OpDispatchBuilder::AVXVectorUnaryOpImpl(OpcodeArgs, IROps IROp, size_t ElementSize, bool Scalar) {
-  const auto Size = Scalar ? ElementSize : GetSrcSize(Op);
+  // In the event of a scalar operation and a vector source, then
+  // we can specify the entire vector length in order to avoid
+  // unnecessary sign extension on the element to be operated on.
+  // In the event of a memory operand, we load the exact element size.
+  const auto SrcSize = Scalar && Op->Src[1].IsGPR() ? 16U : GetSrcSize(Op);
+  const auto OpSize = Scalar ? ElementSize : GetSrcSize(Op);
   const auto DstSize = GetDstSize(Op);
 
   OrderedNode *Src = [&] {
     const auto SrcIndex = Scalar ? 1 : 0;
-    return LoadSource(FPRClass, Op, Op->Src[SrcIndex], Op->Flags, -1);
+    return LoadSource_WithOpSize(FPRClass, Op, Op->Src[SrcIndex], SrcSize, Op->Flags, -1);
   }();
   OrderedNode *Dest = [&] {
     const auto& Operand = Scalar ? Op->Src[0] : Op->Dest;
     return LoadSource_WithOpSize(FPRClass, Op, Operand, DstSize, Op->Flags, -1);
   }();
 
-  auto ALUOp = _VFSqrt(Size, ElementSize, Src);
+  auto ALUOp = _VFSqrt(OpSize, ElementSize, Src);
 
   // Overwrite our IR's op type
   ALUOp.first->Header.Op = IROp;
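
Note for the AVX path: for scalar forms the element being operated on comes from the
second source operand (Op->Src[1]), while Op->Src[0] supplies the upper bits merged
into the destination, which is why the IsGPR check is on Src[1] rather than Src[0].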


@@ -206,47 +206,38 @@
     ]
   },
   "sqrtss xmm0, xmm1": {
-    "ExpectedInstructionCount": 5,
+    "ExpectedInstructionCount": 2,
     "Optimal": "No",
     "Comment": "0xf3 0x0f 0x51",
     "ExpectedArm64ASM": [
-      "movi v0.2d, #0x0",
-      "mov v0.s[0], v17.s[0]",
-      "mov v4.16b, v0.16b",
-      "fsqrt s4, s4",
+      "fsqrt s4, s17",
       "mov v16.s[0], v4.s[0]"
     ]
   },
   "rsqrtss xmm0, xmm1": {
-    "ExpectedInstructionCount": 7,
+    "ExpectedInstructionCount": 4,
     "Optimal": "No",
     "Comment": [
       "FEAT_FPRES could make this more optimal",
       "0xf3 0x0f 0x52"
     ],
     "ExpectedArm64ASM": [
-      "movi v0.2d, #0x0",
-      "mov v0.s[0], v17.s[0]",
-      "mov v4.16b, v0.16b",
       "fmov s0, #0x70 (1.0000)",
-      "fsqrt s1, s4",
+      "fsqrt s1, s17",
       "fdiv s4, s0, s1",
       "mov v16.s[0], v4.s[0]"
     ]
   },
   "rcpss xmm0, xmm1": {
-    "ExpectedInstructionCount": 6,
+    "ExpectedInstructionCount": 3,
     "Optimal": "No",
     "Comment": [
       "FEAT_FPRES could make this more optimal",
       "0xf3 0x0f 0x53"
     ],
     "ExpectedArm64ASM": [
-      "movi v0.2d, #0x0",
-      "mov v0.s[0], v17.s[0]",
-      "mov v4.16b, v0.16b",
       "fmov s0, #0x70 (1.0000)",
-      "fdiv s4, s0, s4",
+      "fdiv s4, s0, s17",
      "mov v16.s[0], v4.s[0]"
    ]
  },


@@ -187,15 +187,14 @@
     ]
   },
   "sqrtsd xmm0, xmm1": {
-    "ExpectedInstructionCount": 3,
+    "ExpectedInstructionCount": 2,
     "Optimal": "No",
     "Comment": [
       "With AFP mode FEX can remove an insert after the operation.",
       "0xf2 0x0f 0x51"
     ],
     "ExpectedArm64ASM": [
-      "mov v4.8b, v17.8b",
-      "fsqrt d4, d4",
+      "fsqrt d4, d17",
       "mov v16.d[0], v4.d[0]"
     ]
   },


@@ -729,7 +729,7 @@
     ]
   },
   "vsqrtss xmm0, xmm1, xmm2": {
-    "ExpectedInstructionCount": 11,
+    "ExpectedInstructionCount": 8,
     "Optimal": "No",
     "Comment": [
       "Insert in to first element could be more optimal, which is the common case.",
@@ -737,9 +737,6 @@
     ],
     "ExpectedArm64ASM": [
       "mov z4.d, p7/m, z18.d",
-      "movi v0.2d, #0x0",
-      "mov v0.s[0], v4.s[0]",
-      "mov v4.16b, v0.16b",
       "mov z5.d, p7/m, z17.d",
       "fsqrt s4, s4",
       "mov v0.16b, v5.16b",
@@ -750,7 +747,7 @@
     ]
   },
   "vsqrtsd xmm0, xmm1, xmm2": {
-    "ExpectedInstructionCount": 9,
+    "ExpectedInstructionCount": 8,
     "Optimal": "No",
     "Comment": [
       "Insert in to first element could be more optimal, which is the common case.",
@@ -758,7 +755,6 @@
     ],
     "ExpectedArm64ASM": [
       "mov z4.d, p7/m, z18.d",
-      "mov v4.8b, v4.8b",
       "mov z5.d, p7/m, z17.d",
       "fsqrt d4, d4",
       "mov v0.16b, v5.16b",
@@ -800,7 +796,7 @@
     ]
   },
   "vrsqrtss xmm0, xmm1, xmm2": {
-    "ExpectedInstructionCount": 13,
+    "ExpectedInstructionCount": 10,
     "Optimal": "No",
     "Comment": [
       "FEAT_FPRES could make this more optimal",
@@ -808,9 +804,6 @@
     ],
     "ExpectedArm64ASM": [
       "mov z4.d, p7/m, z18.d",
-      "movi v0.2d, #0x0",
-      "mov v0.s[0], v4.s[0]",
-      "mov v4.16b, v0.16b",
       "mov z5.d, p7/m, z17.d",
       "fmov s0, #0x70 (1.0000)",
       "fsqrt s1, s4",
@@ -853,7 +846,7 @@
     ]
   },
   "vrcpss xmm0, xmm1, xmm2": {
-    "ExpectedInstructionCount": 12,
+    "ExpectedInstructionCount": 9,
     "Optimal": "No",
     "Comment": [
       "FEAT_FPRES could make this more optimal",
@@ -861,9 +854,6 @@
     ],
     "ExpectedArm64ASM": [
       "mov z4.d, p7/m, z18.d",
-      "movi v0.2d, #0x0",
-      "mov v0.s[0], v4.s[0]",
-      "mov v4.16b, v0.16b",
       "mov z5.d, p7/m, z17.d",
       "fmov s0, #0x70 (1.0000)",
       "fdiv s4, s0, s4",