OpcodeDispatcher: Implement AVX gathers with SVE256

Just to ensure we still have feature parity.
Ryan Houdek 2024-06-22 06:46:55 -07:00 committed by Alyssa Rosenzweig
parent 77ba708933
commit a4fa3a460e
3 changed files with 114 additions and 0 deletions

@@ -5294,6 +5294,11 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() {
{OPD(2, 0b01, 0x8C), 1, &OpDispatchBuilder::VPMASKMOVOp<false>},
{OPD(2, 0b01, 0x8E), 1, &OpDispatchBuilder::VPMASKMOVOp<true>},
{OPD(2, 0b01, 0x90), 1, &OpDispatchBuilder::VPGATHER<OpSize::i32Bit>},
{OPD(2, 0b01, 0x91), 1, &OpDispatchBuilder::VPGATHER<OpSize::i64Bit>},
{OPD(2, 0b01, 0x92), 1, &OpDispatchBuilder::VPGATHER<OpSize::i32Bit>},
{OPD(2, 0b01, 0x93), 1, &OpDispatchBuilder::VPGATHER<OpSize::i64Bit>},
{OPD(2, 0b01, 0x96), 1, &OpDispatchBuilder::VFMADDSUB<1, 3, 2>},
{OPD(2, 0b01, 0x97), 1, &OpDispatchBuilder::VFMSUBADD<1, 3, 2>},
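For context, the template argument here selects the VSIB index element size, while VEX.W (checked in the handler) selects the loaded data size, so these four entries cover all eight AVX2 gather forms. The mapping, per the x86 opcode map (an annotation, not part of the diff):

// 0x90: VPGATHERDD (W0) / VPGATHERDQ (W1) - 32-bit indices
// 0x91: VPGATHERQD (W0) / VPGATHERQQ (W1) - 64-bit indices
// 0x92: VGATHERDPS (W0) / VGATHERDPD (W1) - 32-bit indices
// 0x93: VGATHERQPS (W0) / VGATHERQPD (W1) - 64-bit indices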

@@ -982,6 +982,10 @@ public:
uint8_t Scale;
};
RefVSIB LoadVSIB(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags);
template<size_t AddrElementSize>
void VPGATHER(OpcodeArgs);
template<size_t ElementSize, size_t DstElementSize, bool Signed>
void ExtendVectorElements(OpcodeArgs);
template<size_t ElementSize>

@@ -5106,4 +5106,109 @@ template void OpDispatchBuilder::VFMSUBADD<1, 3, 2>(OpcodeArgs);
template void OpDispatchBuilder::VFMSUBADD<2, 1, 3>(OpcodeArgs);
template void OpDispatchBuilder::VFMSUBADD<2, 3, 1>(OpcodeArgs);
OpDispatchBuilder::RefVSIB OpDispatchBuilder::LoadVSIB(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags) {
const bool IsVSIB = (Op->Flags & X86Tables::DecodeFlags::FLAG_VSIB_BYTE) != 0;
LOGMAN_THROW_A_FMT(Operand.IsSIB() && IsVSIB, "Trying to load VSIB for something that isn't the correct type!");
// VSIB is a very special case which has a ton of encoded data.
// Get it in a format we can reason about.
const auto Index_gpr = Operand.Data.SIB.Index;
const auto Base_gpr = Operand.Data.SIB.Base;
LOGMAN_THROW_AA_FMT(Index_gpr >= FEXCore::X86State::REG_XMM_0 && Index_gpr <= FEXCore::X86State::REG_XMM_15, "must be AVX reg");
LOGMAN_THROW_AA_FMT(
Base_gpr == FEXCore::X86State::REG_INVALID || (Base_gpr >= FEXCore::X86State::REG_RAX && Base_gpr <= FEXCore::X86State::REG_R15),
"Base must be a GPR.");
const auto Index_XMM_gpr = Index_gpr - X86State::REG_XMM_0;
return {
.Low = LoadXMMRegister(Index_XMM_gpr),
.BaseAddr = Base_gpr != FEXCore::X86State::REG_INVALID ? LoadGPRRegister(Base_gpr, OpSize::i64Bit, 0, false) : nullptr,
.Displacement = Operand.Data.SIB.Offset,
.Scale = Operand.Data.SIB.Scale,
};
}
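Each gathered element addresses memory with the usual base + scaled-index + displacement rule. A minimal scalar sketch of the computation the gather paths below must reproduce (a hypothetical helper, not part of this change):

#include <cstdint>

// Effective address of gathered element i: Base + SignExtended(Index[i]) * Scale + Disp.
// Scale is one of 1, 2, 4, or 8, straight from the SIB byte.
uint64_t VSIBElementAddress(uint64_t Base, int64_t Index, uint8_t Scale, int32_t Disp) {
  return Base + static_cast<uint64_t>(Index) * Scale + static_cast<int64_t>(Disp);
}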
template<size_t AddrElementSize>
void OpDispatchBuilder::VPGATHER(OpcodeArgs) {
LOGMAN_THROW_A_FMT(AddrElementSize == OpSize::i32Bit || AddrElementSize == OpSize::i64Bit, "Unknown address element size");
const auto Size = GetDstSize(Op);
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
///< Element size is determined by W flag.
const OpSize ElementLoadSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit;
auto VSIB = LoadVSIB(Op, Op->Src[0], Op->Flags);
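// SVE gather loads only support an unscaled vector offset or one scaled by the
// element size, and need the index element width to match the loaded element
// width; anything else takes the split 128-bit fallback below.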
const bool SupportsSVELoad = (VSIB.Scale == 1 || VSIB.Scale == AddrElementSize) && (AddrElementSize == ElementLoadSize);
Ref Dest = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
Ref Mask = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
Ref Result {};
if (!SupportsSVELoad) {
// Fall back to the split 128-bit path when this addressing mode can't map to the backend's SVE gather.
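// Split each 256-bit register into two 128-bit halves so the helper shared
// with the AVX128 implementation can gather each half independently.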
RefPair Dest128 {
.Low = Dest,
.High = _VDupElement(OpSize::i256Bit, OpSize::i128Bit, Dest, 1),
};
RefPair Mask128 {
.Low = Mask,
.High = _VDupElement(OpSize::i256Bit, OpSize::i128Bit, Mask, 1),
};
RefVSIB VSIB128 = VSIB;
if (Is128Bit) {
///< A bit careful for the VSIB index register duplicating.
VSIB128.High = VSIB128.Low;
} else {
VSIB128.High = _VDupElement(OpSize::i256Bit, OpSize::i128Bit, VSIB128.Low, 1);
}
auto Result128 = AVX128_VPGatherImpl<AddrElementSize>(SizeToOpSize(Size), ElementLoadSize, Dest128, Mask128, VSIB128);
// The result is currently split into two 128-bit halves; merge them back into one 256-bit register.
Result = _VInsElement(OpSize::i256Bit, OpSize::i128Bit, 1, 0, Result128.Low, Result128.High);
} else {
///< SVE path: perform the full-width gather in one operation.
///< BaseAddr doesn't need to exist; fold the displacement into it here.
Ref BaseAddr = VSIB.BaseAddr;
if (BaseAddr && VSIB.Displacement) {
BaseAddr = _Add(OpSize::i64Bit, BaseAddr, _Constant(VSIB.Displacement));
} else if (VSIB.Displacement) {
BaseAddr = _Constant(VSIB.Displacement);
} else if (!BaseAddr) {
BaseAddr = Invalid();
}
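// Per AVX gather semantics: lanes whose mask element has its MSB set are loaded
// from BaseAddr + Index * Scale; unset lanes keep their prior value from Dest.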
Result = _VLoadVectorGatherMasked(Size, ElementLoadSize, Dest, Mask, BaseAddr, VSIB.Low, Invalid(), AddrElementSize, VSIB.Scale, 0, 0);
}
if (Is128Bit) {
if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
// Special case: a 128-bit gather using 64-bit address indices with 32-bit results
// only loads two 32-bit elements, into the lower 64 bits of the destination.
// Bits [255:64] all become zero.
Result = _VMov(OpSize::i64Bit, Result);
} else {
Result = _VMov(OpSize::i128Bit, Result);
}
} else {
if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
// If we only fetched 128-bits worth of data then the upper-result is all zero.
Result = _VMov(OpSize::i128Bit, Result);
}
}
StoreResult(FPRClass, Op, Result, -1);
///< Assume non-faulting behaviour and clear the mask register.
auto Zero = LoadZeroVector(Size);
StoreResult_WithOpSize(FPRClass, Op, Op->Src[1], Zero, Size, -1);
}
template void OpDispatchBuilder::VPGATHER<4>(OpcodeArgs);
template void OpDispatchBuilder::VPGATHER<8>(OpcodeArgs);
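As a reference for the behaviour both paths implement (including the mask clearing above), here is a minimal scalar model of a 32-bit-index, 32-bit-element gather; a sketch under assumed names, not FEX code:

#include <cstdint>
#include <cstring>

// Scalar model of an 8-element VPGATHERDD: dest, [base + index*scale + disp], mask.
void GatherReferenceModel(uint32_t Dest[8], const int32_t Index[8], uint32_t Mask[8],
                          const uint8_t* Base, int32_t Disp, uint8_t Scale) {
  for (int i = 0; i < 8; ++i) {
    if (Mask[i] & 0x80000000u) { // MSB of each mask element selects the lane.
      uint32_t Value;
      std::memcpy(&Value, Base + static_cast<int64_t>(Index[i]) * Scale + Disp, sizeof(Value));
      Dest[i] = Value;
    } // Unselected lanes keep their previous Dest value.
    Mask[i] = 0; // The mask register is zeroed once the gather completes without faulting.
  }
}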
} // namespace FEXCore::IR