Merge pull request #3692 from pmatos/AFP_RPRES_fix

Fixes AFP.NEP handling on scalar insertions
Ryan Houdek 2024-06-19 19:23:49 -07:00 committed by GitHub
commit da21ee3cda
11 changed files with 95 additions and 68 deletions

View File

@@ -224,16 +224,18 @@ DEF_FBINOP_SCALAR_INSERT(VFSubScalarInsert, fsub)
DEF_FBINOP_SCALAR_INSERT(VFMulScalarInsert, fmul)
DEF_FBINOP_SCALAR_INSERT(VFDivScalarInsert, fdiv)
+// VFScalarOperation performs the operation described through ScalarEmit between Vector1 and Vector2,
+// storing the result in Dst. This is a scalar operation, so only the lowest element of each vector is used.
+// The untouched bits of the destination come from Vector1, unless it's a 256-bit vector
+// and ZeroUpperBits is true, in which case the upper bits are zeroed.
void Arm64JITCore::VFScalarOperation(uint8_t OpSize, uint8_t ElementSize, bool ZeroUpperBits, ScalarBinaryOpCaller ScalarEmit,
ARMEmitter::VRegister Dst, ARMEmitter::VRegister Vector1, ARMEmitter::VRegister Vector2) {
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
-if (!Is256Bit) {
-  LOGMAN_THROW_A_FMT(ZeroUpperBits == false, "128-bit operation doesn't support ZeroUpperBits in {}", __func__);
-}
+LOGMAN_THROW_A_FMT(Is256Bit || !ZeroUpperBits, "128-bit operation doesn't support ZeroUpperBits in {}", __func__);
// Bit of a tricky detail.
-// The upper bits of the destination comes from the first source.
+// The upper bits of the destination come from Vector1.
LOGMAN_THROW_AA_FMT(ElementSize == 2 || ElementSize == 4 || ElementSize == 8, "Invalid size");
const auto SubRegSize = ARMEmitter::ToVectorSizePair(ElementSize == 2 ? ARMEmitter::SubRegSize::i16Bit :
ElementSize == 4 ? ARMEmitter::SubRegSize::i32Bit :
@@ -261,8 +263,8 @@ void Arm64JITCore::VFScalarOperation(uint8_t OpSize, uint8_t ElementSize, bool Z
ins(SubRegSize.Vector, Dst.Q(), 0, VTMP1.Q(), 0);
}
}
-} else if (Dst != Vector2) {
-  if (!ZeroUpperBits && Is256Bit) {
+} else if (Dst != Vector2) { // Dst different from both Vector1 and Vector2
+  if (Is256Bit && !ZeroUpperBits) {
mov(Dst.Z(), Vector1.Z());
} else {
mov(Dst.Q(), Vector1.Q());
@@ -279,36 +281,30 @@ void Arm64JITCore::VFScalarOperation(uint8_t OpSize, uint8_t ElementSize, bool Z
ins(SubRegSize.Vector, Dst.Q(), 0, VTMP1.Q(), 0);
}
}
-} else {
-  // Destination intersects Vector2, can't do anything optimal in this case.
-  // Do the scalar operation first and then move and insert.
+} else { // Dst same as Vector2
ScalarEmit(VTMP1, Vector1, Vector2);
if (!ZeroUpperBits && Is256Bit) {
mov(Dst.Z(), Vector1.Z());
-} else {
-  mov(Dst.Q(), Vector1.Q());
-}
-if (!ZeroUpperBits && Is256Bit) {
ptrue(SubRegSize.Vector, Predicate, ARMEmitter::PredicatePattern::SVE_VL1);
mov(SubRegSize.Vector, Dst.Z(), Predicate.Merging(), VTMP1.Z());
} else {
+mov(Dst.Q(), Vector1.Q());
ins(SubRegSize.Vector, Dst.Q(), 0, VTMP1.Q(), 0);
}
}
}
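
For reference, the contract all three aliasing branches above implement can be modeled in plain C++ (a minimal sketch with illustrative names, not the emitter itself; the branches only differ in how they avoid clobbering a source that aliases Dst):

#include <array>
#include <cstddef>

// Model of VFScalarOperation's result: lane 0 gets the scalar result,
// every other lane comes from Vector1 (or is zeroed for AVX-style ops).
template <std::size_t N, typename Op>
std::array<float, N> ScalarInsertModel(Op ScalarOp, const std::array<float, N>& Vector1,
                                       const std::array<float, N>& Vector2, bool ZeroUpperBits) {
  std::array<float, N> Dst{};
  Dst[0] = ScalarOp(Vector1[0], Vector2[0]);
  for (std::size_t i = 1; i < N; ++i) {
    Dst[i] = ZeroUpperBits ? 0.0f : Vector1[i];
  }
  return Dst;
}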
+// Similar to VFScalarOperation, this performs the operation described through ScalarEmit, operating only on Vector2.
+// However, the result of the scalar operation is inserted into Vector1 and moved to the destination.
+// The untouched bits of the destination come from Vector1, unless it's a 256-bit vector
+// and ZeroUpperBits is true, in which case the upper bits are zeroed.
void Arm64JITCore::VFScalarUnaryOperation(uint8_t OpSize, uint8_t ElementSize, bool ZeroUpperBits, ScalarUnaryOpCaller ScalarEmit,
ARMEmitter::VRegister Dst, ARMEmitter::VRegister Vector1,
std::variant<ARMEmitter::VRegister, ARMEmitter::Register> Vector2) {
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
-if (!Is256Bit) {
-  LOGMAN_THROW_A_FMT(ZeroUpperBits == false, "128-bit operation doesn't support ZeroUpperBits in {}", __func__);
-}
-// Bit of a tricky detail.
-// The upper bits of the destination comes from the first source.
+LOGMAN_THROW_A_FMT(Is256Bit || !ZeroUpperBits, "128-bit operation doesn't support ZeroUpperBits in {}", __func__);
LOGMAN_THROW_AA_FMT(ElementSize == 2 || ElementSize == 4 || ElementSize == 8, "Invalid size");
const auto SubRegSize = ARMEmitter::ToVectorSizePair(ElementSize == 2 ? ARMEmitter::SubRegSize::i16Bit :
@@ -327,7 +323,7 @@ void Arm64JITCore::VFScalarUnaryOperation(uint8_t OpSize, uint8_t ElementSize, b
mov(Dst.Q(), Vector1.Q());
}
-if (HostSupportsAFP) {
+if (HostSupportsAFP) { // or Dst (here Dst == Vector1)
// If the host CPU supports AFP then scalar does an insert without modifying upper bits.
ScalarEmit(Dst, Vector2);
} else {
@@ -366,14 +362,10 @@ void Arm64JITCore::VFScalarUnaryOperation(uint8_t OpSize, uint8_t ElementSize, b
if (!ZeroUpperBits && Is256Bit) {
mov(Dst.Z(), Vector1.Z());
-} else {
-  mov(Dst.Q(), Vector1.Q());
-}
-if (!ZeroUpperBits && Is256Bit) {
ptrue(SubRegSize.Vector, Predicate, ARMEmitter::PredicatePattern::SVE_VL1);
mov(SubRegSize.Vector, Dst.Z(), Predicate.Merging(), VTMP1.Z());
} else {
+mov(Dst.Q(), Vector1.Q());
ins(SubRegSize.Vector, Dst.Q(), 0, VTMP1.Q(), 0);
}
}
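
The folded sequence relies on SVE predication: "ptrue p0.d, vl1" activates only element 0, so the merging "mov" writes just that lane and leaves the rest of the 256-bit register intact, which is what makes the earlier unconditional 128-bit copy redundant. A rough model, assuming four 64-bit lanes:

#include <array>
#include <cstdint>

using ZReg = std::array<uint64_t, 4>; // a 256-bit Z register as 64-bit lanes

// Model of "ptrue p0.d, vl1" followed by "mov zd.d, p0/m, zs.d": only the
// single active element is copied; inactive lanes merge from the destination.
ZReg PredicatedLane0Move(ZReg Zd, const ZReg& Zs) {
  Zd[0] = Zs[0];
  return Zd;
}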
@@ -457,12 +449,17 @@ DEF_OP(VFRSqrtScalarInsert) {
fmov(SubRegSize.Scalar, VTMP1.Q(), 1.0f);
fsqrt(SubRegSize.Scalar, VTMP2, Src);
-fdiv(SubRegSize.Scalar, Dst, VTMP1, VTMP2);
+if (HostSupportsAFP) {
+  fdiv(SubRegSize.Scalar, VTMP1, VTMP1, VTMP2);
+  ins(SubRegSize.Vector, Dst, 0, VTMP1, 0);
+} else {
+  fdiv(SubRegSize.Scalar, Dst, VTMP1, VTMP2);
+}
};
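
The new branch matters because the scalar-insert helpers assume that, on an AFP host, ScalarEmit writes only the lowest element of Dst. This emulated rsqrt is a multi-instruction sequence whose final fdiv would clobber Dst's upper bits, so on AFP hosts it lands in VTMP1 and is inserted explicitly. A sketch of the intended semantics (illustrative C++, not the emitter):

#include <array>
#include <cmath>

using Vec4f = std::array<float, 4>;

// rsqrtss-style semantics: lane 0 becomes 1.0 / sqrt(src lane 0),
// lanes 1..3 of the destination are preserved.
Vec4f RSqrtScalarInsertModel(Vec4f Dst, const Vec4f& Src) {
  const float Tmp = 1.0f / std::sqrt(Src[0]); // fmov + fsqrt + fdiv into VTMP1
  Dst[0] = Tmp;                               // ins vd.s[0], vtmp.s[0]
  return Dst;
}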
auto ScalarEmitRPRES = [this, SubRegSize](ARMEmitter::VRegister Dst, std::variant<ARMEmitter::VRegister, ARMEmitter::Register> SrcVar) {
auto Src = *std::get_if<ARMEmitter::VRegister>(&SrcVar);
-frsqrte(SubRegSize.Scalar, Dst.S(), Src.S());
+frsqrte(SubRegSize.Scalar, Dst.D(), Src.D());
};
std::array<ScalarUnaryOpCaller, 2> Handlers = {
@@ -590,7 +587,28 @@ DEF_OP(VSToFVectorInsert) {
// Claim the element size is 8-bytes.
// Might be scalar 8-byte (cvtsi2ss xmm0, rax)
// Might be vector i32v2 (cvtpi2ps xmm0, mm0)
-VFScalarUnaryOperation(IROp->Size, ElementSize * (HasTwoElements ? 2 : 1), Op->ZeroUpperBits, ScalarEmit, Dst, Vector1, Vector2);
+if (!HasTwoElements) {
+  VFScalarUnaryOperation(IROp->Size, ElementSize, Op->ZeroUpperBits, ScalarEmit, Dst, Vector1, Vector2);
+  return;
+}
+// Handle the odd case where this is actually a vector operation rather than a scalar one.
+const auto Is256Bit = IROp->Size == Core::CPUState::XMM_AVX_REG_SIZE;
+constexpr auto Predicate = ARMEmitter::PReg::p0;
+ScalarEmit(VTMP1, Vector2);
+if (!Op->ZeroUpperBits && Is256Bit) {
+  if (Dst != Vector1) {
+    mov(Dst.Z(), Vector1.Z());
+  }
+  ptrue(ARMEmitter::SubRegSize::i64Bit, Predicate, ARMEmitter::PredicatePattern::SVE_VL1);
+  mov(ARMEmitter::SubRegSize::i64Bit, Dst.Z(), Predicate.Merging(), VTMP1.Z());
+} else {
+  if (Dst != Vector1) {
+    mov(Dst.Q(), Vector1.Q());
+  }
+  ins(ARMEmitter::SubRegSize::i64Bit, Dst.Q(), 0, VTMP1.Q(), 0);
+}
}
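
The HasTwoElements path models cvtpi2ps: two int32 elements are converted and the packed 64-bit result replaces only the low lane of the destination. In plain C++ (a sketch with illustrative names):

#include <array>
#include <cstdint>

// Convert two int32s into lanes 0..1 while preserving lanes 2..3,
// matching the "ins vd.d[0], vtmp.d[0]" that follows the scvtf.
std::array<float, 4> SToFVectorInsertModel(std::array<float, 4> Dst, const std::array<int32_t, 2>& Src) {
  Dst[0] = static_cast<float>(Src[0]); // scvtf vtmp.2s, vsrc.2s
  Dst[1] = static_cast<float>(Src[1]);
  return Dst;
}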
DEF_OP(VSToFGPRInsert) {
@@ -679,11 +697,11 @@ DEF_OP(VFCMPScalarInsert) {
auto ScalarEmitEQ = [this, SubRegSize](ARMEmitter::VRegister Dst, ARMEmitter::VRegister Src1, ARMEmitter::VRegister Src2) {
switch (SubRegSize.Scalar) {
case ARMEmitter::ScalarRegSize::i16Bit: {
-fcmeq(Dst.H(), Src1.H(), Src2.H());
+fcmeq(Dst.H(), Src2.H(), Src1.H());
break;
}
case ARMEmitter::ScalarRegSize::i32Bit:
-case ARMEmitter::ScalarRegSize::i64Bit: fcmeq(SubRegSize.Scalar, Dst, Src1, Src2); break;
+case ARMEmitter::ScalarRegSize::i64Bit: fcmeq(SubRegSize.Scalar, Dst, Src2, Src1); break;
default: break;
}
};
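
Swapping the sources is safe (floating-point equality is commutative, so lane 0 is unchanged), and it is the actual AFP.NEP fix: per our reading of FEAT_AFP, with FPCR.NEP set a scalar FCMEQ merges the untouched upper elements of the destination from the second source register, so that operand must be the one whose upper bits the x86 semantics preserve. A hedged model of that assumption:

#include <array>
#include <bit>
#include <cstdint>

using Vec4f = std::array<float, 4>;

// Model of scalar FCMEQ with FPCR.NEP set, assuming the upper elements
// merge from the second source (Src2). Swapping sources therefore selects
// whose upper bits survive without changing the lane 0 compare result.
Vec4f FcmeqNepModel(const Vec4f& Src1, const Vec4f& Src2) {
  Vec4f Dst = Src2;
  Dst[0] = (Src1[0] == Src2[0]) ? std::bit_cast<float>(UINT32_C(0xFFFFFFFF)) : 0.0f;
  return Dst;
}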
@@ -748,11 +766,11 @@ DEF_OP(VFCMPScalarInsert) {
[this, SubRegSize, ZeroUpperBits, Is256Bit](ARMEmitter::VRegister Dst, ARMEmitter::VRegister Src1, ARMEmitter::VRegister Src2) {
switch (SubRegSize.Scalar) {
case ARMEmitter::ScalarRegSize::i16Bit: {
-fcmeq(VTMP1.H(), Src1.H(), Src2.H());
+fcmeq(VTMP1.H(), Src2.H(), Src1.H());
break;
}
case ARMEmitter::ScalarRegSize::i32Bit:
-case ARMEmitter::ScalarRegSize::i64Bit: fcmeq(SubRegSize.Scalar, VTMP1, Src1, Src2); break;
+case ARMEmitter::ScalarRegSize::i64Bit: fcmeq(SubRegSize.Scalar, VTMP1, Src2, Src1); break;
default: break;
}
// If the destination is a temporary then it is going to do an insert after the operation.

View File

@@ -10,23 +10,27 @@
},
"Instructions": {
"cvtpi2ps xmm0, [rax]": {
"ExpectedInstructionCount": 2,
"ExpectedInstructionCount": 4,
"Comment": [
"0x0f 0x2a"
],
"ExpectedArm64ASM": [
"ldr d2, [x4]",
"scvtf v16.2s, v2.2s"
"scvtf v0.2s, v2.2s",
"ptrue p0.d, vl1",
"mov z16.d, p0/m, z0.d"
]
},
"cvtpi2ps xmm0, mm0": {
"ExpectedInstructionCount": 2,
"ExpectedInstructionCount": 4,
"Comment": [
"0x0f 0x2a"
],
"ExpectedArm64ASM": [
"ldr d2, [x28, #1040]",
"scvtf v16.2s, v2.2s"
"scvtf v0.2s, v2.2s",
"ptrue p0.d, vl1",
"mov z16.d, p0/m, z0.d"
]
}
}
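
The count increase is the price of correctness here: converting straight into v16 clobbered the preserved portion of xmm0, and with a 256-bit register file that portion includes the AVX upper lane. What the expected sequence computes, modeled in C++ (illustrative, four 64-bit lanes):

#include <array>
#include <cstdint>
#include <cstring>

// cvtpi2ps semantics: convert two int32s, write only bits [63:0] of the
// destination, and keep bits [255:64] (lanes 1..3 here) untouched.
std::array<uint64_t, 4> Cvtpi2psModel(std::array<uint64_t, 4> Xmm0, int32_t Lo, int32_t Hi) {
  const float F[2] = {static_cast<float>(Lo), static_cast<float>(Hi)}; // scvtf v0.2s, v2.2s
  std::memcpy(&Xmm0[0], F, sizeof(F)); // ptrue p0.d, vl1 + merging mov into z16
  return Xmm0;
}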

View File

@@ -46,7 +46,7 @@
]
},
"rsqrtss xmm0, xmm1": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 4,
"Comment": [
"FEAT_FPRES could make this more optimal",
"0xf3 0x0f 0x52"
@@ -54,7 +54,8 @@
"ExpectedArm64ASM": [
"fmov s0, #0x70 (1.0000)",
"fsqrt s1, s17",
"fdiv s16, s0, s1"
"fdiv s0, s0, s1",
"mov v16.s[0], v0.s[0]"
]
},
"rcpss xmm0, xmm1": {
@@ -143,7 +144,7 @@
"0xf3 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq s16, s16, s17"
"fcmeq s16, s17, s16"
]
},
"cmpss xmm0, xmm1, 1": {
@@ -184,7 +185,7 @@
"0xf3 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq s0, s16, s17",
"fcmeq s0, s17, s16",
"mvn v0.8b, v0.8b",
"ptrue p0.s, vl1",
"mov z16.s, p0/m, z0.s"

View File

@@ -135,7 +135,7 @@
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq d16, d16, d17"
"fcmeq d16, d17, d16"
]
},
"cmpsd xmm0, xmm1, 1": {
@@ -176,7 +176,7 @@
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq d0, d16, d17",
"fcmeq d0, d17, d16",
"mvn v0.8b, v0.8b",
"ptrue p0.d, vl1",
"mov z16.d, p0/m, z0.d"

View File

@@ -11,23 +11,25 @@
},
"Instructions": {
"cvtpi2ps xmm0, [rax]": {
"ExpectedInstructionCount": 2,
"ExpectedInstructionCount": 3,
"Comment": [
"0x0f 0x2a"
],
"ExpectedArm64ASM": [
"ldr d2, [x4]",
"scvtf v16.2s, v2.2s"
"scvtf v0.2s, v2.2s",
"mov v16.d[0], v0.d[0]"
]
},
"cvtpi2ps xmm0, mm0": {
"ExpectedInstructionCount": 2,
"ExpectedInstructionCount": 3,
"Comment": [
"0x0f 0x2a"
],
"ExpectedArm64ASM": [
"ldr d2, [x28, #1040]",
"scvtf v16.2s, v2.2s"
"scvtf v0.2s, v2.2s",
"mov v16.d[0], v0.d[0]"
]
}
}

View File

@@ -47,7 +47,7 @@
]
},
"rsqrtss xmm0, xmm1": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 4,
"Comment": [
"FEAT_FPRES could make this more optimal",
"0xf3 0x0f 0x52"
@@ -55,7 +55,8 @@
"ExpectedArm64ASM": [
"fmov s0, #0x70 (1.0000)",
"fsqrt s1, s17",
"fdiv s16, s0, s1"
"fdiv s0, s0, s1",
"mov v16.s[0], v0.s[0]"
]
},
"rcpss xmm0, xmm1": {
@@ -144,7 +145,7 @@
"0xf3 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq s16, s16, s17"
"fcmeq s16, s17, s16"
]
},
"cmpss xmm0, xmm1, 1": {
@@ -184,7 +185,7 @@
"0xf3 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq s0, s16, s17",
"fcmeq s0, s17, s16",
"mvn v0.8b, v0.8b",
"mov v16.s[0], v0.s[0]"
]

View File

@@ -136,7 +136,7 @@
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq d16, d16, d17"
"fcmeq d16, d17, d16"
]
},
"cmpsd xmm0, xmm1, 1": {
@@ -176,7 +176,7 @@
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq d0, d16, d17",
"fcmeq d0, d17, d16",
"mvn v0.8b, v0.8b",
"mov v16.d[0], v0.d[0]"
]

View File

@@ -30,7 +30,7 @@
]
},
"vrsqrtss xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 4,
"ExpectedInstructionCount": 5,
"Comment": [
"FEAT_FPRES could make this more optimal",
"Map 1 0b10 0x52 128-bit"
@@ -39,7 +39,8 @@
"mov v16.16b, v17.16b",
"fmov s0, #0x70 (1.0000)",
"fsqrt s1, s18",
"fdiv s16, s0, s1"
"fdiv s0, s0, s1",
"mov v16.s[0], v0.s[0]"
]
},
"vrcpss xmm0, xmm1, xmm2": {
@@ -61,7 +62,7 @@
],
"ExpectedArm64ASM": [
"mov v16.16b, v17.16b",
"fcmeq s16, s17, s18"
"fcmeq s16, s18, s17"
]
},
"vcmpss xmm0, xmm1, xmm2, 0x01": {
@@ -105,7 +106,7 @@
],
"ExpectedArm64ASM": [
"mov v16.16b, v17.16b",
"fcmeq s0, s17, s18",
"fcmeq s0, s18, s17",
"mvn v0.8b, v0.8b",
"mov v16.s[0], v0.s[0]"
]
@@ -154,7 +155,7 @@
],
"ExpectedArm64ASM": [
"mov v16.16b, v17.16b",
"fcmeq d16, d17, d18"
"fcmeq d16, d18, d17"
]
},
"vcmpsd xmm0, xmm1, xmm2, 0x01": {
@@ -198,7 +199,7 @@
],
"ExpectedArm64ASM": [
"mov v16.16b, v17.16b",
"fcmeq d0, d17, d18",
"fcmeq d0, d18, d17",
"mvn v0.8b, v0.8b",
"mov v16.d[0], v0.d[0]"
]

View File

@@ -525,7 +525,7 @@
"0xf3 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq s0, s16, s17",
"fcmeq s0, s17, s16",
"mov v16.s[0], v0.s[0]"
]
},
@@ -568,7 +568,7 @@
"0xf3 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq s0, s16, s17",
"fcmeq s0, s17, s16",
"mvn v0.8b, v0.8b",
"mov v16.s[0], v0.s[0]"
]

View File

@@ -366,7 +366,7 @@
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq d0, d16, d17",
"fcmeq d0, d17, d16",
"mov v16.d[0], v0.d[0]"
]
},
@@ -409,7 +409,7 @@
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq d0, d16, d17",
"fcmeq d0, d17, d16",
"mvn v0.8b, v0.8b",
"mov v16.d[0], v0.d[0]"
]

View File

@@ -2394,7 +2394,7 @@
],
"ExpectedArm64ASM": [
"mov v16.16b, v17.16b",
"fcmeq s0, s17, s18",
"fcmeq s0, s18, s17",
"mov v16.s[0], v0.s[0]"
]
},
@@ -2441,7 +2441,7 @@
],
"ExpectedArm64ASM": [
"mov v16.16b, v17.16b",
"fcmeq s0, s17, s18",
"fcmeq s0, s18, s17",
"mvn v0.8b, v0.8b",
"mov v16.s[0], v0.s[0]"
]
@@ -2490,7 +2490,7 @@
],
"ExpectedArm64ASM": [
"mov v16.16b, v17.16b",
"fcmeq d0, d17, d18",
"fcmeq d0, d18, d17",
"mov v16.d[0], v0.d[0]"
]
},
@@ -2537,7 +2537,7 @@
],
"ExpectedArm64ASM": [
"mov v16.16b, v17.16b",
"fcmeq d0, d17, d18",
"fcmeq d0, d18, d17",
"mvn v0.8b, v0.8b",
"mov v16.d[0], v0.d[0]"
]