# Patch summary: adds support for the VEX-encoded VMOVDDUP instruction.
#
#  * Interface/Core/OpcodeDispatcher.cpp
#      Registers &OpDispatchBuilder::VMOVDDUPOp for OPD(1, 0b11, 0x12) inside
#      InstallHostSpecificOpcodeHandlers(), next to the existing 0x12 entries
#      (VMOVLPOp, VMOVSLDUPOp).
#  * Interface/Core/OpcodeDispatcher.h
#      Declares void VMOVDDUPOp(OpcodeArgs).
#  * Interface/Core/OpcodeDispatcher/Vector.cpp
#      Defines VMOVDDUPOp:
#        - Is256Bit is true when GetSrcSize(Op) == Core::CPUState::XMM_AVX_REG_SIZE.
#        - When Op->Src[0].IsGPR() is true the source is loaded at SrcSize;
#          otherwise it is loaded at MemSize (32 when Is256Bit, else 8), so the
#          non-256-bit memory form only reads 8 bytes.
#        - _VInsElement(SrcSize, 8, 1, 0, Src, Src) is applied, and for the
#          256-bit form a second _VInsElement(SrcSize, 8, 3, 2, Res, Src).
#          Presumably this inserts 8-byte element 0 into element 1 and element 2
#          into element 3; the unit test's expected values (each even quadword
#          duplicated) are consistent with that reading. TODO confirm against
#          the IR op definition.
#        - The result is always stored via StoreResult_WithOpSize with size 32.
#          NOTE(review): for the 128-bit form this presumably relies on the
#          upper half of Res being zero; the test expects zeros there. Confirm.
#  * Interface/Core/X86Tables/VEXTables.cpp
#      Changes the "VMOVDDUP" entry at OPD(1, 0b11, 0x12) from
#      TYPE_UNDEC / FLAGS_NONE to TYPE_INST with
#      GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS.
#  * unittests/ASM/VEX/vmovddup.asm (new file)
#      Requires HostFeatures ["AVX"]. Exercises register source, same-register
#      source/destination, and memory source, each in ymm and xmm form, and
#      checks XMM0-XMM6 against the RegData table.
#
# NOTE(review): this copy of the patch appears to be whitespace-collapsed (hunk
# lines are joined by spaces, blank context lines are missing, and "template"
# seems to have lost its parameter list). It likely will not apply as-is;
# verify against the original commit before use.
diff --git a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index bcfb35b16..9159758d8 100644 --- a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -5697,6 +5697,7 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() { {OPD(1, 0b00, 0x12), 1, &OpDispatchBuilder::VMOVLPOp}, {OPD(1, 0b01, 0x12), 1, &OpDispatchBuilder::VMOVLPOp}, {OPD(1, 0b10, 0x12), 1, &OpDispatchBuilder::VMOVSLDUPOp}, + {OPD(1, 0b11, 0x12), 1, &OpDispatchBuilder::VMOVDDUPOp}, {OPD(1, 0b00, 0x13), 1, &OpDispatchBuilder::VMOVLPOp}, {OPD(1, 0b01, 0x13), 1, &OpDispatchBuilder::VMOVLPOp}, diff --git a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index 0dc2a5238..21842733d 100644 --- a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -411,6 +411,7 @@ public: void VMOVHPOp(OpcodeArgs); void VMOVLPOp(OpcodeArgs); + void VMOVDDUPOp(OpcodeArgs); void VMOVSHDUPOp(OpcodeArgs); void VMOVSLDUPOp(OpcodeArgs); diff --git a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp index 617a33ab5..af04a2f9a 100644 --- a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp +++ b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp @@ -1064,6 +1064,23 @@ void OpDispatchBuilder::MOVDDUPOp(OpcodeArgs) { StoreResult(FPRClass, Op, Res, -1); } +void OpDispatchBuilder::VMOVDDUPOp(OpcodeArgs) { + const auto SrcSize = GetSrcSize(Op); + const auto IsSrcGPR = Op->Src[0].IsGPR(); + const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE; + const auto MemSize = Is256Bit ? 32 : 8; + + OrderedNode *Src = IsSrcGPR ? 
LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], SrcSize, Op->Flags, -1) + : LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], MemSize, Op->Flags, -1); + + OrderedNode *Res = _VInsElement(SrcSize, 8, 1, 0, Src, Src); + if (Is256Bit) { + Res = _VInsElement(SrcSize, 8, 3, 2, Res, Src); + } + + StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Res, 32, -1); +} + template void OpDispatchBuilder::CVTGPR_To_FPR(OpcodeArgs) { OrderedNode *Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, -1); diff --git a/External/FEXCore/Source/Interface/Core/X86Tables/VEXTables.cpp b/External/FEXCore/Source/Interface/Core/X86Tables/VEXTables.cpp index 5c23193b2..0e759f5e3 100644 --- a/External/FEXCore/Source/Interface/Core/X86Tables/VEXTables.cpp +++ b/External/FEXCore/Source/Interface/Core/X86Tables/VEXTables.cpp @@ -30,7 +30,7 @@ void InitializeVEXTables() { {OPD(1, 0b00, 0x12), 1, X86InstInfo{"VMOVLPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_XMM_FLAGS | FLAGS_VEX_1ST_SRC, 0, nullptr}}, {OPD(1, 0b01, 0x12), 1, X86InstInfo{"VMOVLPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_XMM_FLAGS | FLAGS_VEX_1ST_SRC, 0, nullptr}}, {OPD(1, 0b10, 0x12), 1, X86InstInfo{"VMOVSLDUP", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0, nullptr}}, - {OPD(1, 0b11, 0x12), 1, X86InstInfo{"VMOVDDUP", TYPE_UNDEC, FLAGS_NONE, 0, nullptr}}, + {OPD(1, 0b11, 0x12), 1, X86InstInfo{"VMOVDDUP", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0, nullptr}}, {OPD(1, 0b00, 0x13), 1, X86InstInfo{"VMOVLPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0, nullptr}}, {OPD(1, 0b01, 0x13), 1, X86InstInfo{"VMOVLPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0, nullptr}}, diff --git a/unittests/ASM/VEX/vmovddup.asm 
b/unittests/ASM/VEX/vmovddup.asm new file mode 100644 index 000000000..744918cd4 --- /dev/null +++ b/unittests/ASM/VEX/vmovddup.asm @@ -0,0 +1,41 @@ +%ifdef CONFIG +{ + "HostFeatures": ["AVX"], + "RegData": { + "XMM0": ["0xEEEEEEEEFFFFFFFF", "0xCCCCCCCCDDDDDDDD", "0xAAAAAAAABBBBBBBB", "0x0808080809090909"], + "XMM1": ["0xEEEEEEEEFFFFFFFF", "0xEEEEEEEEFFFFFFFF", "0xAAAAAAAABBBBBBBB", "0xAAAAAAAABBBBBBBB"], + "XMM2": ["0xEEEEEEEEFFFFFFFF", "0xEEEEEEEEFFFFFFFF", "0x0000000000000000", "0x0000000000000000"], + "XMM3": ["0xEEEEEEEEFFFFFFFF", "0xEEEEEEEEFFFFFFFF", "0xAAAAAAAABBBBBBBB", "0xAAAAAAAABBBBBBBB"], + "XMM4": ["0xEEEEEEEEFFFFFFFF", "0xEEEEEEEEFFFFFFFF", "0x0000000000000000", "0x0000000000000000"], + "XMM5": ["0xEEEEEEEEFFFFFFFF", "0xEEEEEEEEFFFFFFFF", "0xAAAAAAAABBBBBBBB", "0xAAAAAAAABBBBBBBB"], + "XMM6": ["0xCCCCCCCCDDDDDDDD", "0xCCCCCCCCDDDDDDDD", "0x0000000000000000", "0x0000000000000000"] + } +} +%endif + +lea rdx, [rel .data] + +;; Register duplication +vmovapd ymm0, [rdx] +vmovddup ymm1, ymm0 +; 128-bit +vmovddup xmm2, xmm0 + +;; Same register +vmovapd ymm3, ymm0 +vmovddup ymm3, ymm3 +; 128-bit +vmovapd ymm4, ymm0 +vmovddup xmm4, xmm4 + +;; From memory +vmovddup ymm5, [rdx] +; 128-bit +vmovddup xmm6, [rdx + 8] + +hlt + +align 32 +.data: +db 0xFF, 0xFF, 0xFF, 0xFF, 0xEE, 0xEE, 0xEE, 0xEE, 0xDD, 0xDD, 0xDD, 0xDD, 0xCC, 0xCC, 0xCC, 0xCC +db 0xBB, 0xBB, 0xBB, 0xBB, 0xAA, 0xAA, 0xAA, 0xAA, 0x09, 0x09, 0x09, 0x09, 0x08, 0x08, 0x08, 0x08