Merge pull request #3730 from Sonicadvance1/avx_4

Vector: Helper refactorings
Ryan Houdek 2024-06-21 00:31:14 -07:00 committed by GitHub
commit 3f232e631e
2 changed files with 139 additions and 108 deletions


@@ -609,6 +609,7 @@ public:
void VANDNOp(OpcodeArgs);
Ref VBLENDOpImpl(uint32_t VecSize, uint32_t ElementSize, Ref Src1, Ref Src2, Ref ZeroRegister, uint64_t Selector);
void VBLENDPDOp(OpcodeArgs);
void VPBLENDDOp(OpcodeArgs);
void VPBLENDWOp(OpcodeArgs);
@@ -1112,8 +1113,7 @@ private:
Ref InsertPSOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2,
const X86Tables::DecodedOperand& Imm);
Ref MPSADBWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op,
const X86Tables::DecodedOperand& ImmOp);
Ref MPSADBWOpImpl(size_t SrcSize, Ref Src1, Ref Src2, uint8_t Select);
Ref PALIGNROpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2,
const X86Tables::DecodedOperand& Imm, bool IsAVX);
@@ -1131,9 +1131,9 @@ private:
Ref PINSROpImpl(OpcodeArgs, size_t ElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op,
const X86Tables::DecodedOperand& Imm);
Ref PMADDWDOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2);
Ref PMADDWDOpImpl(size_t Size, Ref Src1, Ref Src2);
Ref PMADDUBSWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op);
Ref PMADDUBSWOpImpl(size_t Size, Ref Src1, Ref Src2);
Ref PMULHRSWOpImpl(OpcodeArgs, Ref Src1, Ref Src2);
@@ -1141,9 +1141,9 @@ private:
Ref PMULLOpImpl(OpcodeArgs, size_t ElementSize, bool Signed, Ref Src1, Ref Src2);
Ref PSADBWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op);
Ref PSADBWOpImpl(size_t Size, Ref Src1, Ref Src2);
Ref PSHUFBOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2);
Ref PSHUFBOpImpl(uint8_t SrcSize, Ref Src1, Ref Src2);
Ref PSIGNImpl(OpcodeArgs, size_t ElementSize, Ref Src1, Ref Src2);
@@ -1155,8 +1155,7 @@ private:
Ref PSRLDOpImpl(OpcodeArgs, size_t ElementSize, Ref Src, Ref ShiftVec);
Ref SHUFOpImpl(OpcodeArgs, size_t ElementSize, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2,
const X86Tables::DecodedOperand& Imm);
Ref SHUFOpImpl(OpcodeArgs, size_t DstSize, size_t ElementSize, Ref Src1, Ref Src2, uint8_t Shuffle);
void VMASKMOVOpImpl(OpcodeArgs, size_t ElementSize, size_t DataSize, bool IsStore, const X86Tables::DecodedOperand& MaskOp,
const X86Tables::DecodedOperand& DataOp);


@@ -1004,11 +1004,7 @@ template void OpDispatchBuilder::VPUNPCKHOp<2>(OpcodeArgs);
template void OpDispatchBuilder::VPUNPCKHOp<4>(OpcodeArgs);
template void OpDispatchBuilder::VPUNPCKHOp<8>(OpcodeArgs);
Ref OpDispatchBuilder::PSHUFBOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2) {
Ref Src1Node = LoadSource(FPRClass, Op, Src1, Op->Flags);
Ref Src2Node = LoadSource(FPRClass, Op, Src2, Op->Flags);
const auto SrcSize = GetSrcSize(Op);
Ref OpDispatchBuilder::PSHUFBOpImpl(uint8_t SrcSize, Ref Src1, Ref Src2) {
const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE;
// We perform the 256-bit version as two 128-bit operations due to
@@ -1026,25 +1022,33 @@ Ref OpDispatchBuilder::PSHUFBOpImpl(OpcodeArgs, const X86Tables::DecodedOperand&
const uint8_t MaskImm = SrcSize == 8 ? 0b1000'0111 : 0b1000'1111;
Ref MaskVector = _VectorImm(SrcSize, 1, MaskImm);
Ref MaskedIndices = _VAnd(SrcSize, SrcSize, Src2Node, MaskVector);
Ref MaskedIndices = _VAnd(SrcSize, SrcSize, Src2, MaskVector);
Ref Low = _VTBL1(SanitizedSrcSize, Src1Node, MaskedIndices);
Ref Low = _VTBL1(SanitizedSrcSize, Src1, MaskedIndices);
if (!Is256Bit) {
return Low;
}
Ref HighSrc1 = _VInsElement(SrcSize, 16, 0, 1, Src1Node, Src1Node);
Ref HighSrc1 = _VInsElement(SrcSize, 16, 0, 1, Src1, Src1);
Ref High = _VTBL1(SanitizedSrcSize, HighSrc1, MaskedIndices);
return _VInsElement(SrcSize, 16, 1, 0, Low, High);
}
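For reference, the x86 PSHUFB behaviour this lowering has to reproduce can be modelled per byte: if bit 7 of an index byte is set the result byte becomes zero, otherwise the low index bits select a byte from the source (modulo 8 for the 64-bit MMX form, modulo 16 for XMM). On AArch64, TBL already returns zero for out-of-range indices, which is why masking the indices with 0b1000'0111 / 0b1000'1111 is sufficient. A minimal scalar sketch of the 128-bit case (illustrative code, not FEX's):

#include <array>
#include <cstddef>
#include <cstdint>

// Scalar model of 128-bit PSHUFB. Keeping bit 7 plus the low 4 bits of each
// index (the 0b1000'1111 mask) is exactly the information this model needs.
std::array<uint8_t, 16> PshufbRef(const std::array<uint8_t, 16>& Src, const std::array<uint8_t, 16>& Indices) {
  std::array<uint8_t, 16> Result {};
  for (size_t i = 0; i < 16; ++i) {
    const uint8_t Idx = Indices[i];
    Result[i] = (Idx & 0x80) ? 0 : Src[Idx & 0x0F];
  }
  return Result;
}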
void OpDispatchBuilder::PSHUFBOp(OpcodeArgs) {
Ref Result = PSHUFBOpImpl(Op, Op->Dest, Op->Src[0]);
const auto SrcSize = GetSrcSize(Op);
Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Result = PSHUFBOpImpl(SrcSize, Src1, Src2);
StoreResult(FPRClass, Op, Result, -1);
}
void OpDispatchBuilder::VPSHUFBOp(OpcodeArgs) {
Ref Result = PSHUFBOpImpl(Op, Op->Src[0], Op->Src[1]);
const auto SrcSize = GetSrcSize(Op);
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
Ref Result = PSHUFBOpImpl(SrcSize, Src1, Src2);
StoreResult(FPRClass, Op, Result, -1);
}
@@ -1259,32 +1263,24 @@ template void OpDispatchBuilder::VPSHUFWOp<2, false>(OpcodeArgs);
template void OpDispatchBuilder::VPSHUFWOp<2, true>(OpcodeArgs);
template void OpDispatchBuilder::VPSHUFWOp<4, true>(OpcodeArgs);
Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, size_t ElementSize, const X86Tables::DecodedOperand& Src1,
const X86Tables::DecodedOperand& Src2, const X86Tables::DecodedOperand& Imm) {
Ref Src1Node = LoadSource(FPRClass, Op, Src1, Op->Flags);
Ref Src2Node = LoadSource(FPRClass, Op, Src2, Op->Flags);
LOGMAN_THROW_A_FMT(Imm.IsLiteral(), "Imm needs to be a literal");
uint8_t Shuffle = Imm.Data.Literal.Value;
Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, size_t DstSize, size_t ElementSize, Ref Src1, Ref Src2, uint8_t Shuffle) {
// Since 256-bit variants and up don't lane cross, we can construct
// everything in terms of the 128-variant, as each lane is essentially
// its own 128-bit segment.
const uint8_t NumElements = Core::CPUState::XMM_SSE_REG_SIZE / ElementSize;
const uint8_t HalfNumElements = NumElements >> 1;
const uint8_t DstSize = GetDstSize(Op);
const bool Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
std::array<Ref, 4> Srcs {};
for (size_t i = 0; i < HalfNumElements; ++i) {
Srcs[i] = Src1Node;
Srcs[i] = Src1;
}
for (size_t i = HalfNumElements; i < NumElements; ++i) {
Srcs[i] = Src2Node;
Srcs[i] = Src2;
}
Ref Dest = Src1Node;
Ref Dest = Src1;
const uint8_t SelectionMask = NumElements - 1;
const uint8_t ShiftAmount = std::popcount(SelectionMask);
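The Srcs array pairs each destination element with its source register (Src1 for the lower half of each 128-bit lane, Src2 for the upper half), while the immediate supplies one SelectionMask-wide index per element. For reference, the architectural shufps selection rule being modelled, as a standalone scalar sketch (illustrative, not FEX code):

#include <array>
#include <cstddef>
#include <cstdint>

// One 128-bit lane of SHUFPS: result elements 0-1 are picked out of Src1,
// elements 2-3 out of Src2, each by a 2-bit field of the immediate.
std::array<uint32_t, 4> ShufpsRef(const std::array<uint32_t, 4>& Src1, const std::array<uint32_t, 4>& Src2, uint8_t Shuffle) {
  const std::array<const std::array<uint32_t, 4>*, 4> Srcs {&Src1, &Src1, &Src2, &Src2};
  std::array<uint32_t, 4> Result {};
  for (size_t i = 0; i < 4; ++i) {
    const uint8_t Sel = (Shuffle >> (2 * i)) & 0b11; // SelectionMask == 0b11, ShiftAmount == 2
    Result[i] = (*Srcs[i])[Sel];
  }
  return Result;
}

The 256-bit AVX encodings apply the same rule independently to each 128-bit lane, which is why the lowering never has to cross lanes.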
@@ -1311,96 +1307,96 @@ Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, size_t ElementSize, const X86Table
// Combining of low 64-bits.
// Dest[63:0] = Src1[63:0]
// Dest[127:64] = Src2[63:0]
return _VZip(DstSize, 8, Src1Node, Src2Node);
return _VZip(DstSize, 8, Src1, Src2);
case 0b11'10'11'10:
// Combining of high 64-bits.
// Dest[63:0] = Src1[127:64]
// Dest[127:64] = Src2[127:64]
return _VZip2(DstSize, 8, Src1Node, Src2Node);
return _VZip2(DstSize, 8, Src1, Src2);
case 0b11'10'01'00:
// Mixing Low and high elements
// Dest[63:0] = Src1[63:0]
// Dest[127:64] = Src2[127:64]
return _VInsElement(DstSize, 8, 1, 1, Src1Node, Src2Node);
return _VInsElement(DstSize, 8, 1, 1, Src1, Src2);
case 0b01'00'11'10:
// Mixing Low and high elements, inverse of above
// Dest[63:0] = Src1[127:64]
// Dest[127:64] = Src2[63:0]
return _VExtr(DstSize, 1, Src2Node, Src1Node, 8);
return _VExtr(DstSize, 1, Src2, Src1, 8);
case 0b10'00'10'00:
// Mixing even elements.
// Dest[31:0] = Src1[31:0]
// Dest[63:32] = Src1[95:64]
// Dest[95:64] = Src2[31:0]
// Dest[127:96] = Src2[95:64]
return _VUnZip(DstSize, ElementSize, Src1Node, Src2Node);
return _VUnZip(DstSize, ElementSize, Src1, Src2);
case 0b11'01'11'01:
// Mixing odd elements.
// Dest[31:0] = Src1[63:32]
// Dest[63:32] = Src1[127:96]
// Dest[95:64] = Src2[63:32]
// Dest[127:96] = Src2[127:96]
return _VUnZip2(DstSize, ElementSize, Src1Node, Src2Node);
return _VUnZip2(DstSize, ElementSize, Src1, Src2);
case 0b11'10'00'00:
case 0b11'10'01'01:
case 0b11'10'10'10:
case 0b11'10'11'11: {
// Bottom elements duplicated, Top 64-bits inserted
auto DupSrc1 = _VDupElement(DstSize, ElementSize, Src1Node, Shuffle & 0b11);
return _VZip2(DstSize, 8, DupSrc1, Src2Node);
auto DupSrc1 = _VDupElement(DstSize, ElementSize, Src1, Shuffle & 0b11);
return _VZip2(DstSize, 8, DupSrc1, Src2);
}
case 0b01'00'00'00:
case 0b01'00'01'01:
case 0b01'00'10'10:
case 0b01'00'11'11: {
// Bottom elements duplicated, Bottom 64-bits inserted
auto DupSrc1 = _VDupElement(DstSize, ElementSize, Src1Node, Shuffle & 0b11);
return _VZip(DstSize, 8, DupSrc1, Src2Node);
auto DupSrc1 = _VDupElement(DstSize, ElementSize, Src1, Shuffle & 0b11);
return _VZip(DstSize, 8, DupSrc1, Src2);
}
case 0b00'00'01'00:
case 0b01'01'01'00:
case 0b10'10'01'00:
case 0b11'11'01'00: {
// Top elements duplicated, Bottom 64-bits inserted
auto DupSrc2 = _VDupElement(DstSize, ElementSize, Src2Node, (Shuffle >> 4) & 0b11);
return _VZip(DstSize, 8, Src1Node, DupSrc2);
auto DupSrc2 = _VDupElement(DstSize, ElementSize, Src2, (Shuffle >> 4) & 0b11);
return _VZip(DstSize, 8, Src1, DupSrc2);
}
case 0b00'00'11'10:
case 0b01'01'11'10:
case 0b10'10'11'10:
case 0b11'11'11'10: {
// Top elements duplicated, Top 64-bits inserted
auto DupSrc2 = _VDupElement(DstSize, ElementSize, Src2Node, (Shuffle >> 4) & 0b11);
return _VZip2(DstSize, 8, Src1Node, DupSrc2);
auto DupSrc2 = _VDupElement(DstSize, ElementSize, Src2, (Shuffle >> 4) & 0b11);
return _VZip2(DstSize, 8, Src1, DupSrc2);
}
case 0b01'00'01'11: {
// TODO: This doesn't generate optimal code.
// RA doesn't understand that Src1Node is dead after VInsElement due to SRA class differences.
// RA doesn't understand that Src1 is dead after VInsElement due to SRA class differences.
// With RA fixes this would be 2 instructions.
// Odd elements inverted, Low 64-bits inserted
Src1Node = _VInsElement(DstSize, 4, 0, 3, Src1Node, Src1Node);
return _VZip(DstSize, 8, Src1Node, Src2Node);
Src1 = _VInsElement(DstSize, 4, 0, 3, Src1, Src1);
return _VZip(DstSize, 8, Src1, Src2);
}
case 0b11'10'01'11: {
// TODO: This doesn't generate optimal code.
// RA doesn't understand that Src1Node is dead after VInsElement due to SRA class differences.
// RA doesn't understand that Src1 is dead after VInsElement due to SRA class differences.
// With RA fixes this would be 2 instructions.
// Odd elements inverted, Top 64-bits inserted
Src1Node = _VInsElement(DstSize, 4, 0, 3, Src1Node, Src1Node);
return _VInsElement(DstSize, 8, 1, 1, Src1Node, Src2Node);
Src1 = _VInsElement(DstSize, 4, 0, 3, Src1, Src1);
return _VInsElement(DstSize, 8, 1, 1, Src1, Src2);
}
case 0b01'00'00'01: {
// Lower 32-bit elements inverted, low 64-bits inserted
Src1Node = _VRev64(DstSize, 4, Src1Node);
return _VZip(DstSize, 8, Src1Node, Src2Node);
Src1 = _VRev64(DstSize, 4, Src1);
return _VZip(DstSize, 8, Src1, Src2);
}
case 0b11'10'00'01: {
// TODO: This doesn't generate optimal code.
// RA doesn't understand that Src1Node is dead after VInsElement due to SRA class differences.
// RA doesn't understand that Src1 is dead after VInsElement due to SRA class differences.
// With RA fixes this would be 2 instructions.
// Lower 32-bit elements inverted, Top 64-bits inserted
Src1Node = _VRev64(DstSize, 4, Src1Node);
return _VInsElement(DstSize, 8, 1, 1, Src1Node, Src2Node);
Src1 = _VRev64(DstSize, 4, Src1);
return _VInsElement(DstSize, 8, 1, 1, Src1, Src2);
}
case 0b00'00'00'00:
case 0b00'00'01'01:
@@ -1419,8 +1415,8 @@ Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, size_t ElementSize, const X86Table
case 0b11'11'10'10:
case 0b11'11'11'11: {
// Duplicate element in upper and lower across each 64-bit segment.
auto DupSrc1 = _VDupElement(DstSize, ElementSize, Src1Node, Shuffle & 0b11);
auto DupSrc2 = _VDupElement(DstSize, ElementSize, Src2Node, (Shuffle >> 4) & 0b11);
auto DupSrc1 = _VDupElement(DstSize, ElementSize, Src1, Shuffle & 0b11);
auto DupSrc2 = _VDupElement(DstSize, ElementSize, Src2, (Shuffle >> 4) & 0b11);
return _VZip(DstSize, 8, DupSrc1, DupSrc2);
}
default:
@@ -1428,7 +1424,7 @@ Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, size_t ElementSize, const X86Table
if (CTX->BackendFeatures.SupportsVTBL2) {
auto LookupIndexes =
LoadAndCacheIndexedNamedVectorConstant(DstSize, FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_SHUFPS, Shuffle * 16);
return _VTBL2(DstSize, Src1Node, Src2Node, LookupIndexes);
return _VTBL2(DstSize, Src1, Src2, LookupIndexes);
}
break;
}
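The fallback above indexes a precomputed named-vector constant with Shuffle * 16 and hands it to VTBL2, where byte indices 0-15 select from Src1 and 16-31 from Src2 (the two table registers concatenated). Assuming that layout, the 16 lookup indices for one immediate could be derived like this (an illustrative sketch of the table contents, not how FEX actually emits the constants):

#include <array>
#include <cstddef>
#include <cstdint>

// Byte indices for a TBL2-style lookup implementing SHUFPS with a given imm8,
// under the assumption that bytes 0-15 address Src1 and bytes 16-31 address Src2.
std::array<uint8_t, 16> ShufpsTbl2Indices(uint8_t Shuffle) {
  std::array<uint8_t, 16> Indices {};
  for (size_t Element = 0; Element < 4; ++Element) {
    const uint8_t Sel = (Shuffle >> (2 * Element)) & 0b11;
    // Destination elements 0-1 read from Src1, elements 2-3 from Src2.
    const uint8_t Base = static_cast<uint8_t>(Sel * 4 + (Element >= 2 ? 16 : 0));
    for (size_t Byte = 0; Byte < 4; ++Byte) {
      Indices[Element * 4 + Byte] = static_cast<uint8_t>(Base + Byte);
    }
  }
  return Indices;
}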
@@ -1436,18 +1432,18 @@ Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, size_t ElementSize, const X86Table
switch (Shuffle & 0b11) {
case 0b00:
// Low 64-bits of each source interleaved.
return _VZip(DstSize, ElementSize, Src1Node, Src2Node);
return _VZip(DstSize, ElementSize, Src1, Src2);
case 0b01:
// Upper 64-bits of Src1 in lower bits
// Lower 64-bits of Src2 in upper bits.
return _VExtr(DstSize, 1, Src2Node, Src1Node, 8);
return _VExtr(DstSize, 1, Src2, Src1, 8);
case 0b10:
// Lower 64-bits of Src1 in lower bits.
// Upper 64-bits of Src2 in upper bits.
return _VInsElement(DstSize, ElementSize, 1, 1, Src1Node, Src2Node);
return _VInsElement(DstSize, ElementSize, 1, 1, Src1, Src2);
case 0b11:
// Upper 64-bits of each source interleaved.
return _VZip2(DstSize, ElementSize, Src1Node, Src2Node);
return _VZip2(DstSize, ElementSize, Src1, Src2);
}
}
@@ -1463,7 +1459,13 @@ Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, size_t ElementSize, const X86Table
template<size_t ElementSize>
void OpDispatchBuilder::SHUFOp(OpcodeArgs) {
Ref Result = SHUFOpImpl(Op, ElementSize, Op->Dest, Op->Src[0], Op->Src[1]);
Ref Src1Node = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
Ref Src2Node = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
LOGMAN_THROW_A_FMT(Op->Src[1].IsLiteral(), "Imm needs to be a literal");
uint8_t Shuffle = Op->Src[1].Data.Literal.Value;
Ref Result = SHUFOpImpl(Op, GetDstSize(Op), ElementSize, Src1Node, Src2Node, Shuffle);
StoreResult(FPRClass, Op, Result, -1);
}
template void OpDispatchBuilder::SHUFOp<4>(OpcodeArgs);
@@ -1471,7 +1473,13 @@ template void OpDispatchBuilder::SHUFOp<8>(OpcodeArgs);
template<size_t ElementSize>
void OpDispatchBuilder::VSHUFOp(OpcodeArgs) {
Ref Result = SHUFOpImpl(Op, ElementSize, Op->Src[0], Op->Src[1], Op->Src[2]);
Ref Src1Node = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2Node = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
LOGMAN_THROW_A_FMT(Op->Src[2].IsLiteral(), "Imm needs to be a literal");
uint8_t Shuffle = Op->Src[2].Data.Literal.Value;
Ref Result = SHUFOpImpl(Op, GetDstSize(Op), ElementSize, Src1Node, Src2Node, Shuffle);
StoreResult(FPRClass, Op, Result, -1);
}
template void OpDispatchBuilder::VSHUFOp<4>(OpcodeArgs);
@@ -3361,7 +3369,7 @@ template void OpDispatchBuilder::VPFCMPOp<0>(OpcodeArgs);
template void OpDispatchBuilder::VPFCMPOp<1>(OpcodeArgs);
template void OpDispatchBuilder::VPFCMPOp<2>(OpcodeArgs);
Ref OpDispatchBuilder::PMADDWDOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2) {
Ref OpDispatchBuilder::PMADDWDOpImpl(size_t Size, Ref Src1, Ref Src2) {
// This is a pretty curious operation
// Does two MADD operations across 4 16bit signed integers and accumulates to 32bit integers in the destination
//
@@ -3370,42 +3378,42 @@ Ref OpDispatchBuilder::PMADDWDOpImpl(OpcodeArgs, const X86Tables::DecodedOperand
// xmm1[63:32] = (xmm1[47:32] * xmm2[47:32]) + (xmm1[63:48] * xmm2[63:48])
// etc.. for larger registers
auto Size = GetSrcSize(Op);
Ref Src1Node = LoadSource(FPRClass, Op, Src1, Op->Flags);
Ref Src2Node = LoadSource(FPRClass, Op, Src2, Op->Flags);
if (Size == 8) {
if (Size == OpSize::i64Bit) {
// MMX implementation can be slightly more optimal
Size <<= 1;
auto MullResult = _VSMull(Size, 2, Src1Node, Src2Node);
auto MullResult = _VSMull(Size, 2, Src1, Src2);
return _VAddP(Size, 4, MullResult, MullResult);
}
auto Lower = _VSMull(Size, 2, Src1Node, Src2Node);
auto Upper = _VSMull2(Size, 2, Src1Node, Src2Node);
auto Lower = _VSMull(Size, 2, Src1, Src2);
auto Upper = _VSMull2(Size, 2, Src1, Src2);
// [15:0] + [31:16], [47:32] + [63:48], [79:64] + [95:80], [111:96] + [127:112]
return _VAddP(Size, 4, Lower, Upper);
}
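For reference, the behaviour described in the comment above, as a scalar sketch over one 128-bit register (illustrative, not FEX code): corresponding signed 16-bit elements are multiplied and adjacent products summed into 32-bit results, which is what the VSMull/VSMull2 plus VAddP sequence computes.

#include <array>
#include <cstddef>
#include <cstdint>

// Scalar model of PMADDWD: 8 x int16 pairs -> 4 x int32 sums of products.
std::array<int32_t, 4> PmaddwdRef(const std::array<int16_t, 8>& A, const std::array<int16_t, 8>& B) {
  std::array<int32_t, 4> Result {};
  for (size_t i = 0; i < 4; ++i) {
    // Widen before summing; only (-32768 * -32768) * 2 can touch the int32 boundary,
    // and like the hardware it simply wraps to 0x80000000.
    const int64_t Sum = int64_t(A[2 * i]) * B[2 * i] + int64_t(A[2 * i + 1]) * B[2 * i + 1];
    Result[i] = static_cast<int32_t>(Sum);
  }
  return Result;
}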
void OpDispatchBuilder::PMADDWD(OpcodeArgs) {
Ref Result = PMADDWDOpImpl(Op, Op->Dest, Op->Src[0]);
const auto Size = GetSrcSize(Op);
Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Result = PMADDWDOpImpl(Size, Src1, Src2);
StoreResult(FPRClass, Op, Result, -1);
}
void OpDispatchBuilder::VPMADDWDOp(OpcodeArgs) {
Ref Result = PMADDWDOpImpl(Op, Op->Src[0], Op->Src[1]);
const auto Size = GetSrcSize(Op);
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
Ref Result = PMADDWDOpImpl(Size, Src1, Src2);
StoreResult(FPRClass, Op, Result, -1);
}
Ref OpDispatchBuilder::PMADDUBSWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op) {
const auto Size = GetSrcSize(Op);
Ref Src1 = LoadSource(FPRClass, Op, Src1Op, Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Src2Op, Op->Flags);
if (Size == 8) {
Ref OpDispatchBuilder::PMADDUBSWOpImpl(size_t Size, Ref Src1, Ref Src2) {
if (Size == OpSize::i64Bit) {
// 64bit is more efficient
// Src1 is unsigned
@@ -3446,12 +3454,22 @@ Ref OpDispatchBuilder::PMADDUBSWOpImpl(OpcodeArgs, const X86Tables::DecodedOpera
}
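The "Src1 is unsigned" comment refers to the asymmetry in pmaddubsw: bytes from the first source are treated as unsigned, bytes from the second as signed, adjacent products are summed, and the 16-bit result saturates. A scalar sketch of that rule (illustrative, not FEX code):

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>

// Scalar model of PMADDUBSW: 16 x (u8 * s8) products, adjacent pairs summed
// with signed saturation into 8 x int16 results.
std::array<int16_t, 8> PmaddubswRef(const std::array<uint8_t, 16>& A, const std::array<int8_t, 16>& B) {
  std::array<int16_t, 8> Result {};
  for (size_t i = 0; i < 8; ++i) {
    const int32_t Sum = int32_t(A[2 * i]) * B[2 * i] + int32_t(A[2 * i + 1]) * B[2 * i + 1];
    Result[i] = static_cast<int16_t>(std::clamp<int32_t>(Sum, -32768, 32767));
  }
  return Result;
}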
void OpDispatchBuilder::PMADDUBSW(OpcodeArgs) {
Ref Result = PMADDUBSWOpImpl(Op, Op->Dest, Op->Src[0]);
const auto Size = GetSrcSize(Op);
Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Result = PMADDUBSWOpImpl(Size, Src1, Src2);
StoreResult(FPRClass, Op, Result, -1);
}
void OpDispatchBuilder::VPMADDUBSWOp(OpcodeArgs) {
Ref Result = PMADDUBSWOpImpl(Op, Op->Src[0], Op->Src[1]);
const auto Size = GetSrcSize(Op);
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
Ref Result = PMADDUBSWOpImpl(Size, Src1, Src2);
StoreResult(FPRClass, Op, Result, -1);
}
@@ -3683,19 +3701,15 @@ void OpDispatchBuilder::VPHSUBSWOp(OpcodeArgs) {
StoreResult(FPRClass, Op, Dest, -1);
}
Ref OpDispatchBuilder::PSADBWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op) {
Ref OpDispatchBuilder::PSADBWOpImpl(size_t Size, Ref Src1, Ref Src2) {
// The documentation is actually incorrect in how this instruction operates
// It strongly implies that the `abs(dest[i] - src[i])` operates in 8bit space
// but it actually operates in more than 8bit space
// This can be seen with `abs(0 - 0xFF)` returning a different result depending
// on bit length
const auto Size = GetSrcSize(Op);
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
Ref Src1 = LoadSource(FPRClass, Op, Src1Op, Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Src2Op, Op->Flags);
if (Size == 8) {
if (Size == OpSize::i64Bit) {
auto AbsResult = _VUABDL(Size * 2, 1, Src1, Src2);
// Now vector-wide add the results for each
@@ -3727,12 +3741,22 @@ Ref OpDispatchBuilder::PSADBWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand&
}
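The comment's abs(0 - 0xFF) example is the key point: truncated to 8 bits the difference would wrap to 1, but PSADBW computes the absolute difference of the zero-extended bytes, giving 255. A scalar sketch of one 64-bit group (illustrative, not FEX code):

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdlib>

// Scalar model of PSADBW for one 64-bit group: sum of absolute differences of
// eight unsigned bytes, with the subtraction done in a wider-than-8-bit space.
uint64_t PsadbwRef(const std::array<uint8_t, 8>& A, const std::array<uint8_t, 8>& B) {
  uint64_t Sum = 0;
  for (size_t i = 0; i < 8; ++i) {
    Sum += static_cast<uint64_t>(std::abs(int32_t(A[i]) - int32_t(B[i]))); // abs(0 - 0xFF) == 255 here
  }
  return Sum;
}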
void OpDispatchBuilder::PSADBW(OpcodeArgs) {
Ref Result = PSADBWOpImpl(Op, Op->Dest, Op->Src[0]);
const auto Size = GetSrcSize(Op);
Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Result = PSADBWOpImpl(Size, Src1, Src2);
StoreResult(FPRClass, Op, Result, -1);
}
void OpDispatchBuilder::VPSADBWOp(OpcodeArgs) {
Ref Result = PSADBWOpImpl(Op, Op->Src[0], Op->Src[1]);
const auto Size = GetSrcSize(Op);
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
Ref Result = PSADBWOpImpl(Size, Src1, Src2);
StoreResult(FPRClass, Op, Result, -1);
}
@@ -4461,8 +4485,7 @@ void OpDispatchBuilder::VDPPOp(OpcodeArgs) {
template void OpDispatchBuilder::VDPPOp<4>(OpcodeArgs);
template void OpDispatchBuilder::VDPPOp<8>(OpcodeArgs);
Ref OpDispatchBuilder::MPSADBWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op,
const X86Tables::DecodedOperand& ImmOp) {
Ref OpDispatchBuilder::MPSADBWOpImpl(size_t SrcSize, Ref Src1, Ref Src2, uint8_t Select) {
const auto LaneHelper = [&, this](uint32_t Selector_Src1, uint32_t Selector_Src2, Ref Src1, Ref Src2) {
// Src2 will grab a 32bit element and duplicate it across the 128bits
Ref DupSrc = _VDupElement(16, 4, Src2, Selector_Src2);
@@ -4529,18 +4552,12 @@ Ref OpDispatchBuilder::MPSADBWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand
return _VAddP(16, 2, TmpTranspose1, TmpTranspose2);
};
LOGMAN_THROW_A_FMT(ImmOp.IsLiteral(), "ImmOp needs to be literal here");
const uint8_t Select = ImmOp.Data.Literal.Value;
const uint8_t SrcSize = GetSrcSize(Op);
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
// Src1 needs to be in byte offset
const uint8_t Select_Src1_Low = ((Select & 0b100) >> 2) * 32 / 8;
const uint8_t Select_Src2_Low = Select & 0b11;
Ref Src1 = LoadSource(FPRClass, Op, Src1Op, Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Src2Op, Op->Flags);
Ref Lower = LaneHelper(Select_Src1_Low, Select_Src2_Low, Src1, Src2);
if (Is128Bit) {
return Lower;
@@ -4556,12 +4573,27 @@ Ref OpDispatchBuilder::MPSADBWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand
}
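For reference, the 128-bit mpsadbw behaviour the LaneHelper reproduces: two bits of the immediate pick a fixed 4-byte block of Src2 (the element that gets duplicated), one bit picks the starting byte offset in Src1, and each of the eight 16-bit results is a sum of absolute differences between a sliding 4-byte window of Src1 and that block. A scalar sketch (illustrative, not FEX code; the offsets mirror Select_Src1_Low / Select_Src2_Low above):

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdlib>

// Scalar model of 128-bit MPSADBW.
std::array<uint16_t, 8> MpsadbwRef(const std::array<uint8_t, 16>& Src1, const std::array<uint8_t, 16>& Src2, uint8_t Select) {
  const size_t Src1Offset = ((Select & 0b100) >> 2) * 4; // byte offset, as in Select_Src1_Low
  const size_t Src2Offset = (Select & 0b11) * 4;
  std::array<uint16_t, 8> Result {};
  for (size_t i = 0; i < 8; ++i) {
    uint16_t Sum = 0;
    for (size_t j = 0; j < 4; ++j) {
      Sum += static_cast<uint16_t>(std::abs(int32_t(Src1[Src1Offset + i + j]) - int32_t(Src2[Src2Offset + j])));
    }
    Result[i] = Sum;
  }
  return Result;
}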
void OpDispatchBuilder::MPSADBWOp(OpcodeArgs) {
Ref Result = MPSADBWOpImpl(Op, Op->Dest, Op->Src[0], Op->Src[1]);
LOGMAN_THROW_A_FMT(Op->Src[1].IsLiteral(), "ImmOp needs to be literal here");
const uint8_t Select = Op->Src[1].Data.Literal.Value;
const uint8_t SrcSize = GetSrcSize(Op);
Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Result = MPSADBWOpImpl(SrcSize, Src1, Src2, Select);
StoreResult(FPRClass, Op, Result, -1);
}
void OpDispatchBuilder::VMPSADBWOp(OpcodeArgs) {
Ref Result = MPSADBWOpImpl(Op, Op->Src[0], Op->Src[1], Op->Src[2]);
LOGMAN_THROW_A_FMT(Op->Src[2].IsLiteral(), "ImmOp needs to be literal here");
const uint8_t Select = Op->Src[2].Data.Literal.Value;
const uint8_t SrcSize = GetSrcSize(Op);
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
Ref Result = MPSADBWOpImpl(SrcSize, Src1, Src2, Select);
StoreResult(FPRClass, Op, Result, -1);
}
@@ -4707,7 +4739,7 @@ void OpDispatchBuilder::VPERMQOp(OpcodeArgs) {
StoreResult(FPRClass, Op, Result, -1);
}
static Ref VBLENDOpImpl(IREmitter& IR, uint32_t VecSize, uint32_t ElementSize, Ref Src1, Ref Src2, Ref ZeroRegister, uint64_t Selector) {
Ref OpDispatchBuilder::VBLENDOpImpl(uint32_t VecSize, uint32_t ElementSize, Ref Src1, Ref Src2, Ref ZeroRegister, uint64_t Selector) {
const std::array Sources {Src1, Src2};
Ref Result = ZeroRegister;
@@ -4715,7 +4747,7 @@ static Ref VBLENDOpImpl(IREmitter& IR, uint32_t VecSize, uint32_t ElementSize, R
for (int i = 0; i < NumElements; i++) {
const auto SelectorIndex = (Selector >> i) & 1;
Result = IR._VInsElement(VecSize, ElementSize, i, i, Result, Sources[SelectorIndex]);
Result = _VInsElement(VecSize, ElementSize, i, i, Result, Sources[SelectorIndex]);
}
return Result;
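The helper assembles the blend one element at a time, starting from a zeroed vector: bit i of the selector chooses whether element i is kept from Src1 or taken from Src2. A scalar equivalent of that selection rule (illustrative, not FEX code):

#include <array>
#include <cstddef>
#include <cstdint>

// Scalar model of the element blend: selector bit i == 0 keeps Src1's element,
// bit i == 1 takes Src2's, matching Sources[SelectorIndex] above.
template<typename T, size_t N>
std::array<T, N> BlendRef(const std::array<T, N>& Src1, const std::array<T, N>& Src2, uint64_t Selector) {
  std::array<T, N> Result {};
  for (size_t i = 0; i < N; ++i) {
    Result[i] = ((Selector >> i) & 1) ? Src2[i] : Src1[i];
  }
  return Result;
}

For VBLENDPD the elements are 8 bytes wide, for VPBLENDD 4, and for VPBLENDW 2, matching the ElementSize arguments passed in the callers below.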
@@ -4744,7 +4776,7 @@ void OpDispatchBuilder::VBLENDPDOp(OpcodeArgs) {
}
const auto ZeroRegister = LoadZeroVector(DstSize);
Ref Result = VBLENDOpImpl(*this, DstSize, 8, Src1, Src2, ZeroRegister, Selector);
Ref Result = VBLENDOpImpl(DstSize, 8, Src1, Src2, ZeroRegister, Selector);
StoreResult(FPRClass, Op, Result, -1);
}
@@ -4787,7 +4819,7 @@ void OpDispatchBuilder::VPBLENDDOp(OpcodeArgs) {
}
const auto ZeroRegister = LoadZeroVector(DstSize);
Ref Result = VBLENDOpImpl(*this, DstSize, 4, Src1, Src2, ZeroRegister, Selector);
Ref Result = VBLENDOpImpl(DstSize, 4, Src1, Src2, ZeroRegister, Selector);
if (!Is256Bit) {
Result = _VMov(16, Result);
}
@@ -4821,7 +4853,7 @@ void OpDispatchBuilder::VPBLENDWOp(OpcodeArgs) {
const auto NewSelector = Selector << 8 | Selector;
const auto ZeroRegister = LoadZeroVector(DstSize);
Ref Result = VBLENDOpImpl(*this, DstSize, 2, Src1, Src2, ZeroRegister, NewSelector);
Ref Result = VBLENDOpImpl(DstSize, 2, Src1, Src2, ZeroRegister, NewSelector);
if (Is128Bit) {
Result = _VMov(16, Result);
}
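The Selector << 8 | Selector step reflects that VPBLENDW's 8-bit immediate controls eight words per 128-bit lane, so the 256-bit form replays the same mask on the upper lane (for the 128-bit form the duplicated high bits are simply unused). A small worked example (illustrative):

#include <cstdint>
#include <cstdio>

int main() {
  const uint16_t Selector = 0b1011'0001; // words 0, 4, 5, 7 come from Src2
  const uint16_t NewSelector = static_cast<uint16_t>(Selector << 8 | Selector);
  std::printf("0x%04x\n", NewSelector); // 0xb1b1: the upper lane (words 8-15) uses the same pattern
}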