Merge pull request #3757 from Sonicadvance1/avx_16

AVX128: Implement support for gathers
Ryan Houdek 2024-06-26 13:29:58 -07:00 committed by GitHub
commit 6226c7f4f3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 2217 additions and 135 deletions

View File

@@ -940,6 +940,187 @@ DEF_OP(VStoreVectorMasked) {
}
}
DEF_OP(VLoadVectorGatherMasked) {
const auto Op = IROp->C<IR::IROp_VLoadVectorGatherMasked>();
const auto OpSize = IROp->Size;
const auto VectorIndexSize = Op->VectorIndexElementSize;
const auto OffsetScale = Op->OffsetScale;
const auto DataElementOffsetStart = Op->DataElementOffsetStart;
const auto IndexElementOffsetStart = Op->IndexElementOffsetStart;
///< This IR operation handles discontiguous masked gather load instructions. Some things to note about its behaviour:
/// - VSIB behaviour is almost entirely exposed in the IR operation directly.
/// - Displacement is the only value missing, as that can be added directly to AddrBase.
/// - VectorIndex{Low,High} contain the index offsets for each element getting loaded.
/// - These index element sizes are decoupled from the resulting element size. They can be 32-bit or 64-bit.
/// - When the index element size is 32-bit, the value is sign-extended for the full 64-bit address calculation.
/// - When loading a 128-bit result with 64-bit VectorIndex elements, both VectorIndexLow and VectorIndexHigh are required
/// to get enough pointers.
/// - When VectorIndexElementSize and OffsetScale match Arm64 SVE behaviour, the operation becomes more optimal.
/// - When the behaviour doesn't match, it gets decomposed to an ASIMD-style masked load.
/// - AddrBase also doesn't need to exist.
/// - If the instruction uses 64-bit vector indexing, or 32-bit addresses where the top bit isn't set, then this is valid!
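///
/// As a rough sketch (illustrative pseudo-C, not the emitted code), the per-element behaviour is:
///   for (size_t i = 0; i < NumElements; ++i) {
///     if (SignBitSet(Mask[i])) {
///       // Displacement has already been folded into AddrBase by the dispatcher.
///       Result[i] = Load<ElementSize>(AddrBase + SignExtend64(VectorIndex[i]) * OffsetScale);
///     } else {
///       Result[i] = Incoming[i];
///     }
///   }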
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
if (Is256Bit) {
LOGMAN_THROW_A_FMT(HostSupportsSVE256, "Need SVE256 support in order to use VLoadVectorGatherMasked with 256-bit operation");
}
const auto Dst = GetVReg(Node);
const auto IncomingDst = GetVReg(Op->Incoming.ID());
const auto MaskReg = GetVReg(Op->Mask.ID());
std::optional<ARMEmitter::Register> BaseAddr = !Op->AddrBase.IsInvalid() ? std::make_optional(GetReg(Op->AddrBase.ID())) : std::nullopt;
const auto VectorIndexLow = GetVReg(Op->VectorIndexLow.ID());
std::optional<ARMEmitter::VRegister> VectorIndexHigh =
!Op->VectorIndexHigh.IsInvalid() ? std::make_optional(GetVReg(Op->VectorIndexHigh.ID())) : std::nullopt;
///< If the host supports SVE and the offset scale matches SVE limitations then it can do an SVE style load.
const bool SupportsSVELoad = (HostSupportsSVE128 || HostSupportsSVE256) && (OffsetScale == 1 || OffsetScale == VectorIndexSize) &&
(VectorIndexSize == IROp->ElementSize);
const auto PerformSMove = [this](size_t ElementSize, const ARMEmitter::Register Dst, const ARMEmitter::VRegister Vector, int index) {
switch (ElementSize) {
case 1: smov<ARMEmitter::SubRegSize::i8Bit>(Dst.X(), Vector, index); break;
case 2: smov<ARMEmitter::SubRegSize::i16Bit>(Dst.X(), Vector, index); break;
case 4: smov<ARMEmitter::SubRegSize::i32Bit>(Dst.X(), Vector, index); break;
case 8: umov<ARMEmitter::SubRegSize::i64Bit>(Dst.X(), Vector, index); break;
default: LOGMAN_MSG_A_FMT("Unhandled ExtractElementSize: {}", ElementSize); break;
}
};
const auto PerformMove = [this](size_t ElementSize, const ARMEmitter::Register Dst, const ARMEmitter::VRegister Vector, int index) {
switch (ElementSize) {
case 1: umov<ARMEmitter::SubRegSize::i8Bit>(Dst, Vector, index); break;
case 2: umov<ARMEmitter::SubRegSize::i16Bit>(Dst, Vector, index); break;
case 4: umov<ARMEmitter::SubRegSize::i32Bit>(Dst, Vector, index); break;
case 8: umov<ARMEmitter::SubRegSize::i64Bit>(Dst, Vector, index); break;
default: LOGMAN_MSG_A_FMT("Unhandled ExtractElementSize: {}", ElementSize); break;
}
};
if (SupportsSVELoad) {
ARMEmitter::SVEModType ModType = ARMEmitter::SVEModType::MOD_NONE;
uint8_t SVEScale = FEXCore::ilog2(OffsetScale);
if (VectorIndexSize == 4) {
ModType = ARMEmitter::SVEModType::MOD_SXTW;
} else if (VectorIndexSize == 8 && OffsetScale != 1) {
ModType = ARMEmitter::SVEModType::MOD_LSL;
}
ARMEmitter::Register AddrReg = TMP1;
if (BaseAddr.has_value()) {
AddrReg = GetReg(Op->AddrBase.ID());
} else {
///< OpcodeDispatcher didn't provide a Base address while SVE requires one.
LoadConstant(ARMEmitter::Size::i64Bit, AddrReg, 0);
}
const auto MemDst = ARMEmitter::SVEMemOperand(AddrReg.X(), VectorIndexLow.Z(), ModType, SVEScale);
const auto SubRegSize = ConvertSubRegSize8(IROp);
const auto CMPPredicate = ARMEmitter::PReg::p0;
const auto GoverningPredicate = Is256Bit ? PRED_TMP_32B : PRED_TMP_16B;
// Check if the sign bit is set for the given element size.
cmplt(SubRegSize, CMPPredicate, GoverningPredicate.Zeroing(), MaskReg.Z(), 0);
auto TempDst = VTMP1;
switch (IROp->ElementSize) {
case 1: {
ld1b<ARMEmitter::SubRegSize::i8Bit>(TempDst.Z(), CMPPredicate.Zeroing(), MemDst);
break;
}
case 2: {
ld1h<ARMEmitter::SubRegSize::i16Bit>(TempDst.Z(), CMPPredicate.Zeroing(), MemDst);
break;
}
case 4: {
ld1w<ARMEmitter::SubRegSize::i32Bit>(TempDst.Z(), CMPPredicate.Zeroing(), MemDst);
break;
}
case 8: {
ld1d(TempDst.Z(), CMPPredicate.Zeroing(), MemDst);
break;
}
default: break;
}
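// As an illustration of what the switch above emits: assuming a 32-bit gather with 32-bit indices and
// OffsetScale == 4, the ld1w case corresponds to an SVE gather of the form
//   ld1w {z<tmp>.s}, p0/z, [x<base>, z<index>.s, sxtw #2]
// where the predicate p0 holds the mask comparison result from above.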
///< Merge elements based on predicate.
sel(SubRegSize, Dst.Z(), CMPPredicate, TempDst.Z(), IncomingDst.Z());
} else {
LOGMAN_THROW_A_FMT(!Is256Bit, "Can't emulate this gather load in the backend! Programming error!");
///< Adventurers beware, emulated ASIMD style gather masked load operation.
// Number of elements to load is calculated by the number of index elements available.
size_t NumAddrElements = (VectorIndexHigh.has_value() ? 32 : 16) / VectorIndexSize;
// The number of elements is clamped by the resulting register size.
size_t NumDataElements = std::min<size_t>(IROp->Size / IROp->ElementSize, NumAddrElements);
size_t IndexElementsSizeBytes = NumAddrElements * VectorIndexSize;
if (IndexElementsSizeBytes > 16) {
// We must have a high register in this case.
LOGMAN_THROW_A_FMT(VectorIndexHigh.has_value(), "Need High vector index register!");
}
// Use VTMP1 as the temporary destination
auto TempReg = VTMP1;
auto WorkingReg = TMP1;
auto TempMemReg = TMP2;
const uint64_t ElementSizeInBits = IROp->ElementSize * 8;
mov(TempReg.Q(), IncomingDst.Q());
for (size_t i = DataElementOffsetStart, IndexElement = IndexElementOffsetStart; i < NumDataElements; ++i, ++IndexElement) {
ARMEmitter::SingleUseForwardLabel Skip {};
// Extract mask element
PerformMove(IROp->ElementSize, WorkingReg, MaskReg, i);
// Skip if the mask's sign bit isn't set
tbz(WorkingReg, ElementSizeInBits - 1, &Skip);
// Extract Index Element
if ((IndexElement * VectorIndexSize) >= 16) {
// Fetch from the high index register.
PerformSMove(VectorIndexSize, WorkingReg, *VectorIndexHigh, IndexElement - (16 / VectorIndexSize));
} else {
// Fetch from the low index register.
PerformSMove(VectorIndexSize, WorkingReg, VectorIndexLow, IndexElement);
}
// Calculate memory position for this gather load
if (BaseAddr.has_value()) {
if (VectorIndexSize == 4) {
add(ARMEmitter::Size::i64Bit, TempMemReg, *BaseAddr, WorkingReg, ARMEmitter::ExtendedType::SXTW, FEXCore::ilog2(OffsetScale));
} else {
add(ARMEmitter::Size::i64Bit, TempMemReg, *BaseAddr, WorkingReg, ARMEmitter::ShiftType::LSL, FEXCore::ilog2(OffsetScale));
}
} else {
///< In this case we have no base address; all addresses come from the vector register itself.
if (VectorIndexSize == 4) {
// Sign-extend and shift into the 64-bit register.
sbfiz(ARMEmitter::Size::i64Bit, TempMemReg, WorkingReg, FEXCore::ilog2(OffsetScale), 32);
} else {
lsl(ARMEmitter::Size::i64Bit, TempMemReg, WorkingReg, FEXCore::ilog2(OffsetScale));
}
}
// Now that the address is calculated, do the load.
switch (IROp->ElementSize) {
case 1: ld1<ARMEmitter::SubRegSize::i8Bit>(TempReg.Q(), i, TempMemReg); break;
case 2: ld1<ARMEmitter::SubRegSize::i16Bit>(TempReg.Q(), i, TempMemReg); break;
case 4: ld1<ARMEmitter::SubRegSize::i32Bit>(TempReg.Q(), i, TempMemReg); break;
case 8: ld1<ARMEmitter::SubRegSize::i64Bit>(TempReg.Q(), i, TempMemReg); break;
case 16: ldr(TempReg.Q(), TempMemReg, 0); break;
default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, IROp->ElementSize); FEX_UNREACHABLE;
}
Bind(&Skip);
}
// Move result.
mov(Dst.Q(), TempReg.Q());
}
}
DEF_OP(VLoadVectorElement) {
const auto Op = IROp->C<IR::IROp_VLoadVectorElement>();
const auto OpSize = IROp->Size;

View File

@@ -5294,6 +5294,11 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() {
{OPD(2, 0b01, 0x8C), 1, &OpDispatchBuilder::VPMASKMOVOp<false>},
{OPD(2, 0b01, 0x8E), 1, &OpDispatchBuilder::VPMASKMOVOp<true>},
{OPD(2, 0b01, 0x90), 1, &OpDispatchBuilder::VPGATHER<OpSize::i32Bit>},
{OPD(2, 0b01, 0x91), 1, &OpDispatchBuilder::VPGATHER<OpSize::i64Bit>},
{OPD(2, 0b01, 0x92), 1, &OpDispatchBuilder::VPGATHER<OpSize::i32Bit>},
{OPD(2, 0b01, 0x93), 1, &OpDispatchBuilder::VPGATHER<OpSize::i64Bit>},
{OPD(2, 0b01, 0x96), 1, &OpDispatchBuilder::VFMADDSUB<1, 3, 2>},
{OPD(2, 0b01, 0x97), 1, &OpDispatchBuilder::VFMSUBADD<1, 3, 2>},

View File

@@ -975,6 +975,17 @@ public:
template<uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx>
void VFMSUBADD(OpcodeArgs);
struct RefVSIB {
Ref Low, High;
Ref BaseAddr;
int32_t Displacement;
uint8_t Scale;
};
RefVSIB LoadVSIB(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags);
template<size_t AddrElementSize>
void VPGATHER(OpcodeArgs);
template<size_t ElementSize, size_t DstElementSize, bool Signed>
void ExtendVectorElements(OpcodeArgs);
template<size_t ElementSize>
@@ -1019,6 +1030,7 @@ public:
RefPair AVX128_LoadSource_WithOpSize(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags,
bool NeedsHigh, MemoryAccessType AccessType = MemoryAccessType::DEFAULT);
RefVSIB AVX128_LoadVSIB(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags, bool NeedsHigh);
void AVX128_StoreResult_WithOpSize(FEXCore::X86Tables::DecodedOp Op, const FEXCore::X86Tables::DecodedOperand& Operand, const RefPair Src,
MemoryAccessType AccessType = MemoryAccessType::DEFAULT);
void InstallAVX128Handlers();
@@ -1247,6 +1259,12 @@ public:
template<uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx>
void AVX128_VFMSUBADD(OpcodeArgs);
template<size_t AddrElementSize>
RefPair AVX128_VPGatherImpl(OpSize Size, OpSize ElementLoadSize, RefPair Dest, RefPair Mask, RefVSIB VSIB);
template<size_t AddrElementSize>
void AVX128_VPGATHER(OpcodeArgs);
// End of AVX 128-bit implementation
void InvalidOp(OpcodeArgs);

View File

@@ -327,6 +327,11 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
{OPD(2, 0b01, 0x8C), 1, &OpDispatchBuilder::AVX128_VPMASKMOV<false>},
{OPD(2, 0b01, 0x8E), 1, &OpDispatchBuilder::AVX128_VPMASKMOV<true>},
{OPD(2, 0b01, 0x90), 1, &OpDispatchBuilder::AVX128_VPGATHER<OpSize::i32Bit>},
{OPD(2, 0b01, 0x91), 1, &OpDispatchBuilder::AVX128_VPGATHER<OpSize::i64Bit>},
{OPD(2, 0b01, 0x92), 1, &OpDispatchBuilder::AVX128_VPGATHER<OpSize::i32Bit>},
{OPD(2, 0b01, 0x93), 1, &OpDispatchBuilder::AVX128_VPGATHER<OpSize::i64Bit>},
{OPD(2, 0b01, 0x96), 1, &OpDispatchBuilder::AVX128_VFMADDSUB<1, 3, 2>},
{OPD(2, 0b01, 0x97), 1, &OpDispatchBuilder::AVX128_VFMSUBADD<1, 3, 2>},
@@ -486,10 +491,9 @@ OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_LoadSource_WithOpSize(
AddressMode HighA = A;
HighA.Offset += 16;
///< TODO: Implement VSIB once we get there.
if (Operand.IsSIB()) {
const bool IsVSIB = (Op->Flags & X86Tables::DecodeFlags::FLAG_VSIB_BYTE) != 0;
LOGMAN_THROW_AA_FMT(!IsVSIB, "VSIB currently unsupported");
LOGMAN_THROW_AA_FMT(!IsVSIB, "VSIB uses LoadVSIB instead");
}
return {
@@ -499,6 +503,31 @@ OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_LoadSource_WithOpSize(
}
}
OpDispatchBuilder::RefVSIB
OpDispatchBuilder::AVX128_LoadVSIB(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags, bool NeedsHigh) {
const bool IsVSIB = (Op->Flags & X86Tables::DecodeFlags::FLAG_VSIB_BYTE) != 0;
LOGMAN_THROW_A_FMT(Operand.IsSIB() && IsVSIB, "Trying to load VSIB for something that isn't the correct type!");
// VSIB is a very special case which has a ton of encoded data.
// Get it in a format we can reason about.
const auto Index_gpr = Operand.Data.SIB.Index;
const auto Base_gpr = Operand.Data.SIB.Base;
LOGMAN_THROW_AA_FMT(Index_gpr >= FEXCore::X86State::REG_XMM_0 && Index_gpr <= FEXCore::X86State::REG_XMM_15, "must be AVX reg");
LOGMAN_THROW_AA_FMT(
Base_gpr == FEXCore::X86State::REG_INVALID || (Base_gpr >= FEXCore::X86State::REG_RAX && Base_gpr <= FEXCore::X86State::REG_R15),
"Base must be a GPR.");
const auto Index_XMM_gpr = Index_gpr - X86State::REG_XMM_0;
return {
.Low = AVX128_LoadXMMRegister(Index_XMM_gpr, false),
.High = NeedsHigh ? AVX128_LoadXMMRegister(Index_XMM_gpr, true) : Invalid(),
.BaseAddr = Base_gpr != FEXCore::X86State::REG_INVALID ? LoadGPRRegister(Base_gpr, OpSize::i64Bit, 0, false) : nullptr,
.Displacement = Operand.Data.SIB.Offset,
.Scale = Operand.Data.SIB.Scale,
};
}
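// For reference (illustrative mapping): given an instruction like `vpgatherdd ymm0, [rax + ymm1*4 + 0x20], ymm2`,
// the function above returns Low/High = the two halves of ymm1, BaseAddr = rax, Displacement = 0x20 and Scale = 4.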
void OpDispatchBuilder::AVX128_StoreResult_WithOpSize(FEXCore::X86Tables::DecodedOp Op, const FEXCore::X86Tables::DecodedOperand& Operand,
const RefPair Src, MemoryAccessType AccessType) {
if (Operand.IsGPR()) {
@@ -2007,7 +2036,6 @@ void OpDispatchBuilder::AVX128_VMASKMOVImpl(OpcodeArgs, size_t ElementSize, size
return MakeSegmentAddress(Op, Data, CTX->GetGPRSize());
};
///< TODO: Needs SVE for masked loadstores.
if (IsStore) {
auto Address = MakeAddress(Op->Dest);
@@ -2487,4 +2515,96 @@ void OpDispatchBuilder::AVX128_VFMSUBADD(OpcodeArgs) {
AVX128_VFMAddSubImpl(Op, false, Src1Idx, Src2Idx, AddendIdx);
}
template<size_t AddrElementSize>
OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_VPGatherImpl(OpSize Size, OpSize ElementLoadSize, RefPair Dest, RefPair Mask, RefVSIB VSIB) {
LOGMAN_THROW_A_FMT(AddrElementSize == OpSize::i32Bit || AddrElementSize == OpSize::i64Bit, "Unknown address element size");
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
///< BaseAddr doesn't need to exist; calculate the final base address (with the displacement folded in) here.
Ref BaseAddr = VSIB.BaseAddr;
if (BaseAddr && VSIB.Displacement) {
BaseAddr = _Add(OpSize::i64Bit, BaseAddr, _Constant(VSIB.Displacement));
} else if (VSIB.Displacement) {
BaseAddr = _Constant(VSIB.Displacement);
} else if (!BaseAddr) {
BaseAddr = Invalid();
}
RefPair Result {};
///< Calculate the low-half.
Result.Low = _VLoadVectorGatherMasked(OpSize::i128Bit, ElementLoadSize, Dest.Low, Mask.Low, BaseAddr, VSIB.Low, VSIB.High,
AddrElementSize, VSIB.Scale, 0, 0);
if (Is128Bit) {
Result.High = LoadZeroVector(OpSize::i128Bit);
if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
// Special case for the 128-bit gather load using 64-bit address indexes with 32-bit results.
// Only loads two 32-bit elements into the lower 64 bits of the first destination.
// Bits [255:64] all become zero.
Result.Low = _VZip(OpSize::i128Bit, OpSize::i64Bit, Result.Low, Result.High);
}
} else {
RefPair AddrAddressing {};
Ref DestReg = Dest.High;
Ref MaskReg = Mask.High;
uint8_t IndexElementOffset {};
uint8_t DataElementOffset {};
if (AddrElementSize == ElementLoadSize) {
// If the address element size matches the load element size, then the low and high halves fetch index elements at the same rate.
AddrAddressing.Low = VSIB.High;
AddrAddressing.High = Invalid();
} else if (AddrElementSize == OpSize::i32Bit && ElementLoadSize == OpSize::i64Bit) {
// If the address element size is half the size of the element load size, then we need to start fetching half-way through the low register.
AddrAddressing.Low = VSIB.Low;
AddrAddressing.High = VSIB.High;
IndexElementOffset = OpSize::i128Bit / AddrElementSize / 2;
} else if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
AddrAddressing.Low = VSIB.High;
AddrAddressing.High = Invalid();
DestReg = Result.Low; ///< Start mixing with the low register.
MaskReg = Mask.Low; ///< Mask starts with the low mask here.
IndexElementOffset = 0;
DataElementOffset = OpSize::i128Bit / ElementLoadSize / 2;
}
///< Calculate the high-half.
auto ResultHigh = _VLoadVectorGatherMasked(OpSize::i128Bit, ElementLoadSize, DestReg, MaskReg, BaseAddr, AddrAddressing.Low,
AddrAddressing.High, AddrElementSize, VSIB.Scale, DataElementOffset, IndexElementOffset);
if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
// If we only fetched 128 bits worth of data then the upper result is all zero.
Result = AVX128_Zext(ResultHigh);
} else {
Result.High = ResultHigh;
}
}
return Result;
}
template<size_t AddrElementSize>
void OpDispatchBuilder::AVX128_VPGATHER(OpcodeArgs) {
const auto Size = GetDstSize(Op);
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
///< Element size is determined by W flag.
const OpSize ElementLoadSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit;
auto Dest = AVX128_LoadSource_WithOpSize(Op, Op->Dest, Op->Flags, !Is128Bit);
auto VSIB = AVX128_LoadVSIB(Op, Op->Src[0], Op->Flags, !Is128Bit);
auto Mask = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit);
RefPair Result {};
Result = AVX128_VPGatherImpl<AddrElementSize>(SizeToOpSize(Size), ElementLoadSize, Dest, Mask, VSIB);
AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
///< Assume non-faulting behaviour and clear the mask register.
RefPair ZeroPair {};
ZeroPair.Low = LoadZeroVector(OpSize::i128Bit);
ZeroPair.High = ZeroPair.Low;
AVX128_StoreResult_WithOpSize(Op, Op->Src[1], ZeroPair);
}
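// Worked example (illustrative): for `vpgatherdd ymm0, [rax + ymm1*4], ymm2`, each dword lane of ymm0 whose
// corresponding lane in ymm2 has its top bit set is replaced by the dword at rax + SignExtend(ymm1 lane) * 4;
// the other lanes keep their previous contents, and ymm2 is then cleared (per the non-faulting assumption above).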
} // namespace FEXCore::IR

View File

@@ -5106,4 +5106,109 @@ template void OpDispatchBuilder::VFMSUBADD<1, 3, 2>(OpcodeArgs);
template void OpDispatchBuilder::VFMSUBADD<2, 1, 3>(OpcodeArgs);
template void OpDispatchBuilder::VFMSUBADD<2, 3, 1>(OpcodeArgs);
OpDispatchBuilder::RefVSIB OpDispatchBuilder::LoadVSIB(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags) {
const bool IsVSIB = (Op->Flags & X86Tables::DecodeFlags::FLAG_VSIB_BYTE) != 0;
LOGMAN_THROW_A_FMT(Operand.IsSIB() && IsVSIB, "Trying to load VSIB for something that isn't the correct type!");
// VSIB is a very special case which has a ton of encoded data.
// Get it in a format we can reason about.
const auto Index_gpr = Operand.Data.SIB.Index;
const auto Base_gpr = Operand.Data.SIB.Base;
LOGMAN_THROW_AA_FMT(Index_gpr >= FEXCore::X86State::REG_XMM_0 && Index_gpr <= FEXCore::X86State::REG_XMM_15, "must be AVX reg");
LOGMAN_THROW_AA_FMT(
Base_gpr == FEXCore::X86State::REG_INVALID || (Base_gpr >= FEXCore::X86State::REG_RAX && Base_gpr <= FEXCore::X86State::REG_R15),
"Base must be a GPR.");
const auto Index_XMM_gpr = Index_gpr - X86State::REG_XMM_0;
return {
.Low = LoadXMMRegister(Index_XMM_gpr),
.BaseAddr = Base_gpr != FEXCore::X86State::REG_INVALID ? LoadGPRRegister(Base_gpr, OpSize::i64Bit, 0, false) : nullptr,
.Displacement = Operand.Data.SIB.Offset,
.Scale = Operand.Data.SIB.Scale,
};
}
template<size_t AddrElementSize>
void OpDispatchBuilder::VPGATHER(OpcodeArgs) {
LOGMAN_THROW_A_FMT(AddrElementSize == OpSize::i32Bit || AddrElementSize == OpSize::i64Bit, "Unknown address element size");
const auto Size = GetDstSize(Op);
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
///< Element size is determined by W flag.
const OpSize ElementLoadSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit;
auto VSIB = LoadVSIB(Op, Op->Src[0], Op->Flags);
const bool SupportsSVELoad = (VSIB.Scale == 1 || VSIB.Scale == AddrElementSize) && (AddrElementSize == ElementLoadSize);
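// The check above reflects that SVE gather loads only support an unscaled index or an index scaled by the
// element size, and that the index element size must match the data element size; anything else takes the
// fallback path below.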
Ref Dest = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
Ref Mask = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
Ref Result {};
if (!SupportsSVELoad) {
// We need to go down the fallback path when the backend's SVE gather path can't handle this configuration.
RefPair Dest128 {
.Low = Dest,
.High = _VDupElement(OpSize::i256Bit, OpSize::i128Bit, Dest, 1),
};
RefPair Mask128 {
.Low = Mask,
.High = _VDupElement(OpSize::i256Bit, OpSize::i128Bit, Mask, 1),
};
RefVSIB VSIB128 = VSIB;
if (Is128Bit) {
///< Be a bit careful when duplicating the VSIB index register.
VSIB128.High = VSIB128.Low;
} else {
VSIB128.High = _VDupElement(OpSize::i256Bit, OpSize::i128Bit, VSIB128.Low, 1);
}
auto Result128 = AVX128_VPGatherImpl<AddrElementSize>(SizeToOpSize(Size), ElementLoadSize, Dest128, Mask128, VSIB128);
// The registers are currently split; we need to merge them.
Result = _VInsElement(OpSize::i256Bit, OpSize::i128Bit, 1, 0, Result128.Low, Result128.High);
} else {
///< Calculate the full operation.
///< BaseAddr doesn't need to exist; calculate the final base address (with the displacement folded in) here.
Ref BaseAddr = VSIB.BaseAddr;
if (BaseAddr && VSIB.Displacement) {
BaseAddr = _Add(OpSize::i64Bit, BaseAddr, _Constant(VSIB.Displacement));
} else if (VSIB.Displacement) {
BaseAddr = _Constant(VSIB.Displacement);
} else if (!BaseAddr) {
BaseAddr = Invalid();
}
Result = _VLoadVectorGatherMasked(Size, ElementLoadSize, Dest, Mask, BaseAddr, VSIB.Low, Invalid(), AddrElementSize, VSIB.Scale, 0, 0);
}
if (Is128Bit) {
if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
// Special case for the 128-bit gather load using 64-bit address indexes with 32-bit results.
// Only loads two 32-bit elements into the lower 64 bits of the first destination.
// Bits [255:64] all become zero.
Result = _VMov(OpSize::i64Bit, Result);
} else {
Result = _VMov(OpSize::i128Bit, Result);
}
} else {
if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
// If we only fetched 128 bits worth of data then the upper result is all zero.
Result = _VMov(OpSize::i128Bit, Result);
}
}
StoreResult(FPRClass, Op, Result, -1);
///< Assume non-faulting behaviour and clear the mask register.
auto Zero = LoadZeroVector(Size);
StoreResult_WithOpSize(FPRClass, Op, Op->Src[1], Zero, Size, -1);
}
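// The two instantiations below cover the dword-indexed forms (VPGATHERDD/VPGATHERDQ, VGATHERDPS/VGATHERDPD)
// with AddrElementSize == 4 and the qword-indexed forms (VPGATHERQD/VPGATHERQQ, VGATHERQPS/VGATHERQPD) with
// AddrElementSize == 8.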
template void OpDispatchBuilder::VPGATHER<4>(OpcodeArgs);
template void OpDispatchBuilder::VPGATHER<8>(OpcodeArgs);
} // namespace FEXCore::IR

View File

@@ -343,10 +343,10 @@ std::array<X86InstInfo, MAX_VEX_TABLE_SIZE> VEXTableOps = []() consteval {
{OPD(2, 0b01, 0x8C), 1, X86InstInfo{"VPMASKMOV", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x8E), 1, X86InstInfo{"VPMASKMOV", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_SF_MOD_DST | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x90), 1, X86InstInfo{"VPGATHERDD/Q", TYPE_UNDEC, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x91), 1, X86InstInfo{"VPGATHERQD/Q", TYPE_UNDEC, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x92), 1, X86InstInfo{"VGATHERDPS/D", TYPE_UNDEC, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x93), 1, X86InstInfo{"VGATHERQPS/D", TYPE_UNDEC, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x90), 1, X86InstInfo{"VPGATHERDD/Q", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x91), 1, X86InstInfo{"VPGATHERQD/Q", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x92), 1, X86InstInfo{"VGATHERDPS/D", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x93), 1, X86InstInfo{"VGATHERQPS/D", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x96), 1, X86InstInfo{"VFMADDSUB132", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x97), 1, X86InstInfo{"VFMSUBADD132", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}},

View File

@@ -545,6 +545,19 @@
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / ElementSize"
},
"FPR = VLoadVectorGatherMasked u8:#RegisterSize, u8:#ElementSize, FPR:$Incoming, FPR:$Mask, GPR:$AddrBase, FPR:$VectorIndexLow, FPR:$VectorIndexHigh, u8:$VectorIndexElementSize, u8:$OffsetScale, u8:$DataElementOffsetStart, u8:$IndexElementOffsetStart": {
"Desc": [
"Does a masked load similar to VPGATHERD* where the upper bit of each element",
"determines whether or not that element will be loaded from memory.",
"Most of VSIB encoding is passed directly through to the IR operation."
],
"ImplicitFlagClobber": true,
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / ElementSize",
"EmitValidation": [
"$VectorIndexElementSize == OpSize::i32Bit || $VectorIndexElementSize == OpSize::i64Bit"
]
},
"FPR = VLoadVectorElement u8:#RegisterSize, u8:#ElementSize, FPR:$DstSrc, u8:$Index, GPR:$Addr": {
"Desc": ["Does a memory load to a single element of a vector.",
"Leaves the rest of the vector's data intact.",

File diff suppressed because it is too large