diff --git a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index 77eb99dac..b7c2185e5 100644 --- a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -6010,6 +6010,7 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() { {OPD(1, 0b01, 0xF2), 1, &OpDispatchBuilder::VPSLLOp<4>}, {OPD(1, 0b01, 0xF3), 1, &OpDispatchBuilder::VPSLLOp<8>}, {OPD(1, 0b01, 0xF4), 1, &OpDispatchBuilder::VPMULLOp<4, false>}, + {OPD(1, 0b01, 0xF5), 1, &OpDispatchBuilder::VPMADDWDOp}, {OPD(1, 0b01, 0xF7), 1, &OpDispatchBuilder::MASKMOVOp}, {OPD(1, 0b01, 0xF8), 1, &OpDispatchBuilder::AVXVectorALUOp}, diff --git a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index f99a9bf44..deb33e0d6 100644 --- a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -485,6 +485,8 @@ public: template void VPHSUBOp(OpcodeArgs); + void VPMADDWDOp(OpcodeArgs); + void VPMULHRSWOp(OpcodeArgs); template @@ -809,6 +811,9 @@ private: OrderedNode* PHSUBOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2, size_t ElementSize); + OrderedNode* PMADDWDOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, + const X86Tables::DecodedOperand& Src2); + OrderedNode* PMULHRSWOpImpl(OpcodeArgs, OrderedNode *Src1, OrderedNode *Src2); OrderedNode* PMULHWOpImpl(OpcodeArgs, bool Signed, diff --git a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp index bfee131c1..32c3382b9 100644 --- a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp +++ b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp @@ -2898,7 +2898,8 @@ void OpDispatchBuilder::VPFCMPOp<1>(OpcodeArgs); template void OpDispatchBuilder::VPFCMPOp<2>(OpcodeArgs); -void OpDispatchBuilder::PMADDWD(OpcodeArgs) { +OrderedNode* OpDispatchBuilder::PMADDWDOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, + const X86Tables::DecodedOperand& Src2) { // This is a pretty curious operation // Does two MADD operations across 4 16bit signed integers and accumulates to 32bit integers in the destination // @@ -2909,25 +2910,34 @@ void OpDispatchBuilder::PMADDWD(OpcodeArgs) { auto Size = GetSrcSize(Op); - OrderedNode *Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags, -1); - OrderedNode *Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags, -1); + OrderedNode *Src1Node = LoadSource(FPRClass, Op, Src1, Op->Flags, -1); + OrderedNode *Src2Node = LoadSource(FPRClass, Op, Src2, Op->Flags, -1); if (Size == 8) { Size <<= 1; } - auto Src1_L = _VSXTL(Size, 2, Src1); // [15:0 ], [31:16], [32:47 ], [63:48 ] - auto Src1_H = _VSXTL2(Size, 2, Src1); // [79:64], [95:80], [111:96], [127:112] + auto Src1_L = _VSXTL(Size, 2, Src1Node); // [15:0 ], [31:16], [32:47 ], [63:48 ] + auto Src1_H = _VSXTL2(Size, 2, Src1Node); // [79:64], [95:80], [111:96], [127:112] - auto Src2_L = _VSXTL(Size, 2, Src2); // [15:0 ], [31:16], [32:47 ], [63:48 ] - auto Src2_H = _VSXTL2(Size, 2, Src2); // [79:64], [95:80], [111:96], [127:112] + auto Src2_L = _VSXTL(Size, 2, Src2Node); // [15:0 ], [31:16], [32:47 ], [63:48 ] + auto Src2_H = _VSXTL2(Size, 2, Src2Node); // [79:64], [95:80], [111:96], [127:112] auto Res_L = _VSMul(Size, 4, Src1_L, Src2_L); // [15:0 ], [31:16], [32:47 ], [63:48 ] : Original elements auto Res_H = _VSMul(Size, 4, Src1_H, Src2_H); // [79:64], [95:80], [111:96], [127:112] : Original elements // [15:0 ] + [31:16], [32:47 ] + [63:48 ], [79:64] + [95:80], [111:96] + [127:112] - auto Res = _VAddP(Size, 4, Res_L, Res_H); - StoreResult(FPRClass, Op, Res, -1); + return _VAddP(Size, 4, Res_L, Res_H); +} + +void OpDispatchBuilder::PMADDWD(OpcodeArgs) { + OrderedNode *Result = PMADDWDOpImpl(Op, Op->Dest, Op->Src[0]); + StoreResult(FPRClass, Op, Result, -1); +} + +void OpDispatchBuilder::VPMADDWDOp(OpcodeArgs) { + OrderedNode *Result = PMADDWDOpImpl(Op, Op->Src[0], Op->Src[1]); + StoreResult(FPRClass, Op, Result, -1); } void OpDispatchBuilder::PMADDUBSW(OpcodeArgs) { diff --git a/External/FEXCore/Source/Interface/Core/X86Tables/VEXTables.cpp b/External/FEXCore/Source/Interface/Core/X86Tables/VEXTables.cpp index 2534a81e2..284c33fc7 100644 --- a/External/FEXCore/Source/Interface/Core/X86Tables/VEXTables.cpp +++ b/External/FEXCore/Source/Interface/Core/X86Tables/VEXTables.cpp @@ -246,7 +246,7 @@ void InitializeVEXTables() { {OPD(1, 0b01, 0xF2), 1, X86InstInfo{"VPSLLD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}}, {OPD(1, 0b01, 0xF3), 1, X86InstInfo{"VPSLLQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}}, {OPD(1, 0b01, 0xF4), 1, X86InstInfo{"VPMULUDQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}}, - {OPD(1, 0b01, 0xF5), 1, X86InstInfo{"VPMADDWD", TYPE_UNDEC, FLAGS_NONE, 0, nullptr}}, + {OPD(1, 0b01, 0xF5), 1, X86InstInfo{"VPMADDWD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}}, {OPD(1, 0b01, 0xF6), 1, X86InstInfo{"VPSADBW", TYPE_UNDEC, FLAGS_NONE, 0, nullptr}}, {OPD(1, 0b01, 0xF7), 1, X86InstInfo{"VMASKMOVDQU", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS, 0, nullptr}}, diff --git a/unittests/ASM/VEX/vpmaddwd.asm b/unittests/ASM/VEX/vpmaddwd.asm new file mode 100644 index 000000000..0b0a5e154 --- /dev/null +++ b/unittests/ASM/VEX/vpmaddwd.asm @@ -0,0 +1,36 @@ +%ifdef CONFIG +{ + "HostFeatures": ["AVX"], + "RegData": { + "XMM2": ["0x32F08FD4383B2524", "0x499DE6944FEA7CE4", "0x0000000000000000", "0x0000000000000000"], + "XMM3": ["0x32F08FD4383B2524", "0x499DE6944FEA7CE4", "0x0000000000000000", "0x0000000000000000"], + "XMM4": ["0x32F08FD4383B2524", "0x499DE6944FEA7CE4", "0x41FD357ADA74036A", "0xCCCC999AE38E1C72"], + "XMM5": ["0x32F08FD4383B2524", "0x499DE6944FEA7CE4", "0x41FD357ADA74036A", "0xCCCC999AE38E1C72"] + } +} +%endif + +lea rdx, [rel .data] + +vmovaps ymm0, [rdx] +vmovaps ymm1, [rdx + 32] + +vpmaddwd xmm2, xmm0, [rdx + 32] +vpmaddwd xmm3, xmm0, xmm1 + +vpmaddwd ymm4, ymm0, [rdx + 32] +vpmaddwd ymm5, ymm0, ymm1 + +hlt + +align 32 +.data: +dq 0x4142434445464748 +dq 0x5152535455565758 +dq 0x6666777788889999 +dq 0x5555444433332222 + +dq 0x6162636465666768 +dq 0x7172737475767778 +dq 0x5555444433332222 +dq 0xAAAAAAAAAAAAAAAA