From fb2de47e73ba96688627a5ec227380b5709eb791 Mon Sep 17 00:00:00 2001 From: Lioncache Date: Tue, 14 Feb 2023 18:38:09 -0500 Subject: [PATCH 1/2] OpcodeDispatcher: Factor out PMADDWD implementation to helper This will be used to centralize code to also implement the AVX variant. --- .../Source/Interface/Core/OpcodeDispatcher.h | 3 +++ .../Core/OpcodeDispatcher/Vector.cpp | 23 +++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index d79c24637..7979e6dd5 100644 --- a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -806,6 +806,9 @@ private: OrderedNode* PHSUBOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2, size_t ElementSize); + OrderedNode* PMADDWDOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, + const X86Tables::DecodedOperand& Src2); + OrderedNode* PMULHRSWOpImpl(OpcodeArgs, OrderedNode *Src1, OrderedNode *Src2); OrderedNode* PMULHWOpImpl(OpcodeArgs, bool Signed, diff --git a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp index 0f1f897b7..b82b2fa12 100644 --- a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp +++ b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp @@ -2867,7 +2867,8 @@ void OpDispatchBuilder::VPFCMPOp<1>(OpcodeArgs); template void OpDispatchBuilder::VPFCMPOp<2>(OpcodeArgs); -void OpDispatchBuilder::PMADDWD(OpcodeArgs) { +OrderedNode* OpDispatchBuilder::PMADDWDOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, + const X86Tables::DecodedOperand& Src2) { // This is a pretty curious operation // Does two MADD operations across 4 16bit signed integers and accumulates to 32bit integers in the destination // @@ -2878,25 +2879,29 @@ void OpDispatchBuilder::PMADDWD(OpcodeArgs) { auto Size = GetSrcSize(Op); - OrderedNode *Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags, -1); - OrderedNode *Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags, -1); + OrderedNode *Src1Node = LoadSource(FPRClass, Op, Src1, Op->Flags, -1); + OrderedNode *Src2Node = LoadSource(FPRClass, Op, Src2, Op->Flags, -1); if (Size == 8) { Size <<= 1; } - auto Src1_L = _VSXTL(Size, 2, Src1); // [15:0 ], [31:16], [32:47 ], [63:48 ] - auto Src1_H = _VSXTL2(Size, 2, Src1); // [79:64], [95:80], [111:96], [127:112] + auto Src1_L = _VSXTL(Size, 2, Src1Node); // [15:0 ], [31:16], [32:47 ], [63:48 ] + auto Src1_H = _VSXTL2(Size, 2, Src1Node); // [79:64], [95:80], [111:96], [127:112] - auto Src2_L = _VSXTL(Size, 2, Src2); // [15:0 ], [31:16], [32:47 ], [63:48 ] - auto Src2_H = _VSXTL2(Size, 2, Src2); // [79:64], [95:80], [111:96], [127:112] + auto Src2_L = _VSXTL(Size, 2, Src2Node); // [15:0 ], [31:16], [32:47 ], [63:48 ] + auto Src2_H = _VSXTL2(Size, 2, Src2Node); // [79:64], [95:80], [111:96], [127:112] auto Res_L = _VSMul(Size, 4, Src1_L, Src2_L); // [15:0 ], [31:16], [32:47 ], [63:48 ] : Original elements auto Res_H = _VSMul(Size, 4, Src1_H, Src2_H); // [79:64], [95:80], [111:96], [127:112] : Original elements // [15:0 ] + [31:16], [32:47 ] + [63:48 ], [79:64] + [95:80], [111:96] + [127:112] - auto Res = _VAddP(Size, 4, Res_L, Res_H); - StoreResult(FPRClass, Op, Res, -1); + return _VAddP(Size, 4, Res_L, Res_H); +} + +void OpDispatchBuilder::PMADDWD(OpcodeArgs) { + OrderedNode *Result = PMADDWDOpImpl(Op, Op->Dest, Op->Src[0]); + StoreResult(FPRClass, Op, Result, -1); } void OpDispatchBuilder::PMADDUBSW(OpcodeArgs) { From 4a69b87cb9c5228418341030e1a278bff1036478 Mon Sep 17 00:00:00 2001 From: Lioncache Date: Tue, 14 Feb 2023 18:39:55 -0500 Subject: [PATCH 2/2] OpcodeDispatcher: Handle VPMADDWD --- .../Interface/Core/OpcodeDispatcher.cpp | 1 + .../Source/Interface/Core/OpcodeDispatcher.h | 2 ++ .../Core/OpcodeDispatcher/Vector.cpp | 5 +++ .../Interface/Core/X86Tables/VEXTables.cpp | 2 +- unittests/ASM/VEX/vpmaddwd.asm | 36 +++++++++++++++++++ 5 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 unittests/ASM/VEX/vpmaddwd.asm diff --git a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index d3a1cb9d1..10aa2bc1a 100644 --- a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -6007,6 +6007,7 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() { {OPD(1, 0b01, 0xF2), 1, &OpDispatchBuilder::VPSLLOp<4>}, {OPD(1, 0b01, 0xF3), 1, &OpDispatchBuilder::VPSLLOp<8>}, {OPD(1, 0b01, 0xF4), 1, &OpDispatchBuilder::VPMULLOp<4, false>}, + {OPD(1, 0b01, 0xF5), 1, &OpDispatchBuilder::VPMADDWDOp}, {OPD(1, 0b01, 0xF7), 1, &OpDispatchBuilder::MASKMOVOp}, {OPD(1, 0b01, 0xF8), 1, &OpDispatchBuilder::AVXVectorALUOp}, diff --git a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index 7979e6dd5..cfebecc5e 100644 --- a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -485,6 +485,8 @@ public: template void VPHSUBOp(OpcodeArgs); + void VPMADDWDOp(OpcodeArgs); + void VPMULHRSWOp(OpcodeArgs); template diff --git a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp index b82b2fa12..cb4e345c9 100644 --- a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp +++ b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp @@ -2904,6 +2904,11 @@ void OpDispatchBuilder::PMADDWD(OpcodeArgs) { StoreResult(FPRClass, Op, Result, -1); } +void OpDispatchBuilder::VPMADDWDOp(OpcodeArgs) { + OrderedNode *Result = PMADDWDOpImpl(Op, Op->Src[0], Op->Src[1]); + StoreResult(FPRClass, Op, Result, -1); +} + void OpDispatchBuilder::PMADDUBSW(OpcodeArgs) { // This is a pretty curious operation // Does four MADD operations across 8 8bit signed and unsigned integers and accumulates to 16bit integers in the destination WITH saturation diff --git a/External/FEXCore/Source/Interface/Core/X86Tables/VEXTables.cpp b/External/FEXCore/Source/Interface/Core/X86Tables/VEXTables.cpp index 4a5459fe2..6e1f41ae5 100644 --- a/External/FEXCore/Source/Interface/Core/X86Tables/VEXTables.cpp +++ b/External/FEXCore/Source/Interface/Core/X86Tables/VEXTables.cpp @@ -246,7 +246,7 @@ void InitializeVEXTables() { {OPD(1, 0b01, 0xF2), 1, X86InstInfo{"VPSLLD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}}, {OPD(1, 0b01, 0xF3), 1, X86InstInfo{"VPSLLQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}}, {OPD(1, 0b01, 0xF4), 1, X86InstInfo{"VPMULUDQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}}, - {OPD(1, 0b01, 0xF5), 1, X86InstInfo{"VPMADDWD", TYPE_UNDEC, FLAGS_NONE, 0, nullptr}}, + {OPD(1, 0b01, 0xF5), 1, X86InstInfo{"VPMADDWD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}}, {OPD(1, 0b01, 0xF6), 1, X86InstInfo{"VPSADBW", TYPE_UNDEC, FLAGS_NONE, 0, nullptr}}, {OPD(1, 0b01, 0xF7), 1, X86InstInfo{"VMASKMOVDQU", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS, 0, nullptr}}, diff --git a/unittests/ASM/VEX/vpmaddwd.asm b/unittests/ASM/VEX/vpmaddwd.asm new file mode 100644 index 000000000..0b0a5e154 --- /dev/null +++ b/unittests/ASM/VEX/vpmaddwd.asm @@ -0,0 +1,36 @@ +%ifdef CONFIG +{ + "HostFeatures": ["AVX"], + "RegData": { + "XMM2": ["0x32F08FD4383B2524", "0x499DE6944FEA7CE4", "0x0000000000000000", "0x0000000000000000"], + "XMM3": ["0x32F08FD4383B2524", "0x499DE6944FEA7CE4", "0x0000000000000000", "0x0000000000000000"], + "XMM4": ["0x32F08FD4383B2524", "0x499DE6944FEA7CE4", "0x41FD357ADA74036A", "0xCCCC999AE38E1C72"], + "XMM5": ["0x32F08FD4383B2524", "0x499DE6944FEA7CE4", "0x41FD357ADA74036A", "0xCCCC999AE38E1C72"] + } +} +%endif + +lea rdx, [rel .data] + +vmovaps ymm0, [rdx] +vmovaps ymm1, [rdx + 32] + +vpmaddwd xmm2, xmm0, [rdx + 32] +vpmaddwd xmm3, xmm0, xmm1 + +vpmaddwd ymm4, ymm0, [rdx + 32] +vpmaddwd ymm5, ymm0, ymm1 + +hlt + +align 32 +.data: +dq 0x4142434445464748 +dq 0x5152535455565758 +dq 0x6666777788889999 +dq 0x5555444433332222 + +dq 0x6162636465666768 +dq 0x7172737475767778 +dq 0x5555444433332222 +dq 0xAAAAAAAAAAAAAAAA