AVX128: Implement support for vmovddup

This instruction is a little weird. When accessing memory, the 128-bit form of the instruction only loads 64 bits, while the 256-bit form fetches a full 256 bits. Theoretically the hardware could get away with two 64-bit loads or a wacky 24-byte load, but it looks like, to simplify the hardware, they just specified that the 256-bit version always loads the full range.
2025-02-08 23:57:05 +00:00 · 2024-06-17 20:12:54 -07:00 · 2024-06-17 20:12:54 -07:00 · 96aafb4f07
commit 96aafb4f07
parent dbaf95a8f3
2 changed files with 37 additions and 1 deletions
--- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
+++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
@ -1006,6 +1006,7 @@ public:
  void AVX128_MOVQ(OpcodeArgs);
  void AVX128_VMOVLP(OpcodeArgs);
  void AVX128_VMOVHP(OpcodeArgs);
+  void AVX128_VMOVDDUP(OpcodeArgs);

  // End of AVX 128-bit implementation

--- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp
+++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp
@ -36,7 +36,7 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
    {OPD(1, 0b00, 0x12), 1, &OpDispatchBuilder::AVX128_VMOVLP},
    {OPD(1, 0b01, 0x12), 1, &OpDispatchBuilder::AVX128_VMOVLP},
    // TODO: {OPD(1, 0b10, 0x12), 1, &OpDispatchBuilder::VMOVSLDUPOp},
-    // TODO: {OPD(1, 0b11, 0x12), 1, &OpDispatchBuilder::VMOVDDUPOp},
+    {OPD(1, 0b11, 0x12), 1, &OpDispatchBuilder::AVX128_VMOVDDUP},
    {OPD(1, 0b00, 0x13), 1, &OpDispatchBuilder::AVX128_VMOVLP},
    {OPD(1, 0b01, 0x13), 1, &OpDispatchBuilder::AVX128_VMOVLP},

@ -710,4 +710,39 @@ void OpDispatchBuilder::AVX128_VMOVHP(OpcodeArgs) {
  }
 }

+void OpDispatchBuilder::AVX128_VMOVDDUP(OpcodeArgs) {
+  // VMOVDDUP on the Low/High 128-bit RefPair representation: the low 64-bit element of
+  // each source half is duplicated across that half of the result. For the 128-bit
+  // operation only the low half is computed and the high half is stored as zero.
+  const bool Is256Bit = GetSrcSize(Op) != Core::CPUState::XMM_SSE_REG_SIZE;
+  const auto& Source = Op->Src[0];
+  const bool SourceIsGPR = Source.IsGPR();
+
+  RefPair Input {};
+  if (!Is256Bit && !SourceIsGPR) {
+    // Accesses from memory are a little weird: the 128-bit operation only loads
+    // the 8 bytes that are going to be duplicated.
+    Input.Low = LoadSource_WithOpSize(FPRClass, Op, Source, OpSize::i64Bit, Op->Flags);
+  } else {
+    // Register sources of either size, plus the 256-bit memory form which loads a full
+    // 32 bytes. The last argument is true only when Input.High is consumed below.
+    Input = AVX128_LoadSource_WithOpSize(Op, Source, Op->Flags, Is256Bit);
+  }
+
+  // Duplicate Input.Low[63:0] in to the low 128-bits; common to both operating sizes.
+  Ref LowLane = _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Input.Low, 0);
+
+  Ref HighLane {};
+  if (Is256Bit) {
+    // Duplicate Input.High[63:0] in to the high 128-bits.
+    HighLane = _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Input.High, 0);
+  } else {
+    // 128-bit operation: the high half of the stored result is a zero vector.
+    HighLane = LoadZeroVector(OpSize::i128Bit);
+  }
+
+  // Single store covering both halves of the destination.
+  AVX128_StoreResult_WithOpSize(Op, Op->Dest, RefPair {.Low = LowLane, .High = HighLane});
+}
+
 } // namespace FEXCore::IR