VectorOps: Restructure DPPOpImpl to take already-loaded sources, the destination size, and the immediate mask directly, so that it can be reused by AVX128

2024-12-14 09:28:34 +00:00 · 2024-06-19 03:21:33 -07:00 · 2024-06-19 03:21:33 -07:00 · cc168ce0fb
commit cc168ce0fb
parent 76bd22d279
2 changed files with 12 additions and 12 deletions
--- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
+++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
@@ -1247,8 +1247,7 @@ private:

  Ref CVTGPR_To_FPRImpl(OpcodeArgs, size_t DstElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op);

-  Ref DPPOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2,
-                const X86Tables::DecodedOperand& Imm, size_t ElementSize);
+  Ref DPPOpImpl(size_t DstSize, Ref Src1, Ref Src2, uint8_t Mask, size_t ElementSize);

  Ref VDPPSOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2, const X86Tables::DecodedOperand& Imm);

--- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp
+++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp
@@ -4099,9 +4099,7 @@ void OpDispatchBuilder::PHMINPOSUWOp(OpcodeArgs) {
  StoreResult(FPRClass, Op, Result, -1);
 }

-Ref OpDispatchBuilder::DPPOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2,
-                                 const X86Tables::DecodedOperand& Imm, size_t ElementSize) {
-  const uint8_t Mask = Imm.Literal();
+Ref OpDispatchBuilder::DPPOpImpl(size_t DstSize, Ref Src1, Ref Src2, uint8_t Mask, size_t ElementSize) {
  const auto SizeMask = [ElementSize]() {
    if (ElementSize == 4) {
      return 0b1111;
@@ -4119,7 +4117,6 @@ Ref OpDispatchBuilder::DPPOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Sr

    return FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_DPPD_MASK;
  }();
-  const auto DstSize = GetDstSize(Op);

  Ref ZeroVec = LoadZeroVector(DstSize);
  if (SrcMask == 0 || DstMask == 0) {
@@ -4127,11 +4124,8 @@ Ref OpDispatchBuilder::DPPOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Sr
    return ZeroVec;
  }

-  Ref Src1V = LoadSource(FPRClass, Op, Src1, Op->Flags);
-  Ref Src2V = LoadSource(FPRClass, Op, Src2, Op->Flags);
-
  // First step is to do an FMUL
-  Ref Temp = _VFMul(DstSize, ElementSize, Src1V, Src2V);
+  Ref Temp = _VFMul(DstSize, ElementSize, Src1, Src2);

  // Now mask results based on IndexMask.
  if (SrcMask != SizeMask) {
@@ -4283,7 +4277,11 @@ Ref OpDispatchBuilder::DPPOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Sr

 template<size_t ElementSize>
 void OpDispatchBuilder::DPPOp(OpcodeArgs) {
-  Ref Result = DPPOpImpl(Op, Op->Dest, Op->Src[0], Op->Src[1], ElementSize);
+
+  Ref Dest = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
+  Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
+
+  Ref Result = DPPOpImpl(GetDstSize(Op), Dest, Src, Op->Src[1].Literal(), ElementSize);
  StoreResult(FPRClass, Op, Result, -1);
 }

@@ -4349,7 +4347,10 @@ void OpDispatchBuilder::VDPPOp(OpcodeArgs) {
    // 256-bit DPPS isn't handled by the 128-bit solution.
    Result = VDPPSOpImpl(Op, Op->Src[0], Op->Src[1], Op->Src[2]);
  } else {
-    Result = DPPOpImpl(Op, Op->Src[0], Op->Src[1], Op->Src[2], ElementSize);
+    Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
+    Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
+
+    Result = DPPOpImpl(GetDstSize(Op), Src1, Src2, Op->Src[2].Literal(), ElementSize);
  }

  // We don't need to emit a _VMov to clear the upper lane, since DPPOpImpl uses a zero vector