Merge pull request #2408 from lioncash/pmaddwd

OpcodeDispatcher: Handle VPMADDWD
This commit is contained in:
Ryan Houdek 2023-02-14 17:52:57 -08:00 committed by GitHub
commit efafe0e6e9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 62 additions and 10 deletions

View File

@ -6010,6 +6010,7 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() {
{OPD(1, 0b01, 0xF2), 1, &OpDispatchBuilder::VPSLLOp<4>},
{OPD(1, 0b01, 0xF3), 1, &OpDispatchBuilder::VPSLLOp<8>},
{OPD(1, 0b01, 0xF4), 1, &OpDispatchBuilder::VPMULLOp<4, false>},
{OPD(1, 0b01, 0xF5), 1, &OpDispatchBuilder::VPMADDWDOp},
{OPD(1, 0b01, 0xF7), 1, &OpDispatchBuilder::MASKMOVOp},
{OPD(1, 0b01, 0xF8), 1, &OpDispatchBuilder::AVXVectorALUOp<IR::OP_VSUB, 1>},

View File

@ -485,6 +485,8 @@ public:
template <size_t ElementSize>
void VPHSUBOp(OpcodeArgs);
// Dispatch handler for the VEX-encoded VPMADDWD instruction.
void VPMADDWDOp(OpcodeArgs);
void VPMULHRSWOp(OpcodeArgs);
template <bool Signed>
@ -809,6 +811,9 @@ private:
OrderedNode* PHSUBOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1,
const X86Tables::DecodedOperand& Src2, size_t ElementSize);
// Shared lowering used by both the PMADDWD and VPMADDWD handlers.
// Src1 and Src2 are the decoded operands to load as the two multiplicands;
// the returned node holds the pairwise-added products. Storing that node to
// the destination is left to the caller.
OrderedNode* PMADDWDOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1,
const X86Tables::DecodedOperand& Src2);
OrderedNode* PMULHRSWOpImpl(OpcodeArgs, OrderedNode *Src1, OrderedNode *Src2);
OrderedNode* PMULHWOpImpl(OpcodeArgs, bool Signed,

View File

@ -2898,7 +2898,8 @@ void OpDispatchBuilder::VPFCMPOp<1>(OpcodeArgs);
template
void OpDispatchBuilder::VPFCMPOp<2>(OpcodeArgs);
void OpDispatchBuilder::PMADDWD(OpcodeArgs) {
OrderedNode* OpDispatchBuilder::PMADDWDOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1,
const X86Tables::DecodedOperand& Src2) {
// This is a pretty curious operation
// Does two MADD operations across 4 16bit signed integers and accumulates to 32bit integers in the destination
//
@ -2909,25 +2910,34 @@ void OpDispatchBuilder::PMADDWD(OpcodeArgs) {
auto Size = GetSrcSize(Op);
OrderedNode *Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags, -1);
OrderedNode *Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags, -1);
OrderedNode *Src1Node = LoadSource(FPRClass, Op, Src1, Op->Flags, -1);
OrderedNode *Src2Node = LoadSource(FPRClass, Op, Src2, Op->Flags, -1);
if (Size == 8) {
Size <<= 1;
}
auto Src1_L = _VSXTL(Size, 2, Src1); // [15:0 ], [31:16], [32:47 ], [63:48 ]
auto Src1_H = _VSXTL2(Size, 2, Src1); // [79:64], [95:80], [111:96], [127:112]
auto Src1_L = _VSXTL(Size, 2, Src1Node); // [15:0 ], [31:16], [32:47 ], [63:48 ]
auto Src1_H = _VSXTL2(Size, 2, Src1Node); // [79:64], [95:80], [111:96], [127:112]
auto Src2_L = _VSXTL(Size, 2, Src2); // [15:0 ], [31:16], [32:47 ], [63:48 ]
auto Src2_H = _VSXTL2(Size, 2, Src2); // [79:64], [95:80], [111:96], [127:112]
auto Src2_L = _VSXTL(Size, 2, Src2Node); // [15:0 ], [31:16], [32:47 ], [63:48 ]
auto Src2_H = _VSXTL2(Size, 2, Src2Node); // [79:64], [95:80], [111:96], [127:112]
auto Res_L = _VSMul(Size, 4, Src1_L, Src2_L); // [15:0 ], [31:16], [32:47 ], [63:48 ] : Original elements
auto Res_H = _VSMul(Size, 4, Src1_H, Src2_H); // [79:64], [95:80], [111:96], [127:112] : Original elements
// [15:0 ] + [31:16], [32:47 ] + [63:48 ], [79:64] + [95:80], [111:96] + [127:112]
auto Res = _VAddP(Size, 4, Res_L, Res_H);
StoreResult(FPRClass, Op, Res, -1);
return _VAddP(Size, 4, Res_L, Res_H);
}
// Two-operand PMADDWD: the destination operand is also the first multiplicand,
// and the first source operand is the second multiplicand.
void OpDispatchBuilder::PMADDWD(OpcodeArgs) {
  // Build the multiply-add node and write it straight back to the destination.
  StoreResult(FPRClass, Op, PMADDWDOpImpl(Op, Op->Dest, Op->Src[0]), -1);
}
// Three-operand VPMADDWD: both multiplicands come from the source operands,
// so the destination is only written, never read.
void OpDispatchBuilder::VPMADDWDOp(OpcodeArgs) {
  // Build the multiply-add node and write it to the destination.
  StoreResult(FPRClass, Op, PMADDWDOpImpl(Op, Op->Src[0], Op->Src[1]), -1);
}
void OpDispatchBuilder::PMADDUBSW(OpcodeArgs) {

View File

@ -246,7 +246,7 @@ void InitializeVEXTables() {
{OPD(1, 0b01, 0xF2), 1, X86InstInfo{"VPSLLD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(1, 0b01, 0xF3), 1, X86InstInfo{"VPSLLQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(1, 0b01, 0xF4), 1, X86InstInfo{"VPMULUDQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(1, 0b01, 0xF5), 1, X86InstInfo{"VPMADDWD", TYPE_UNDEC, FLAGS_NONE, 0, nullptr}},
{OPD(1, 0b01, 0xF5), 1, X86InstInfo{"VPMADDWD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(1, 0b01, 0xF6), 1, X86InstInfo{"VPSADBW", TYPE_UNDEC, FLAGS_NONE, 0, nullptr}},
{OPD(1, 0b01, 0xF7), 1, X86InstInfo{"VMASKMOVDQU", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS, 0, nullptr}},

View File

@ -0,0 +1,36 @@
%ifdef CONFIG
{
"HostFeatures": ["AVX"],
"RegData": {
"XMM2": ["0x32F08FD4383B2524", "0x499DE6944FEA7CE4", "0x0000000000000000", "0x0000000000000000"],
"XMM3": ["0x32F08FD4383B2524", "0x499DE6944FEA7CE4", "0x0000000000000000", "0x0000000000000000"],
"XMM4": ["0x32F08FD4383B2524", "0x499DE6944FEA7CE4", "0x41FD357ADA74036A", "0xCCCC999AE38E1C72"],
"XMM5": ["0x32F08FD4383B2524", "0x499DE6944FEA7CE4", "0x41FD357ADA74036A", "0xCCCC999AE38E1C72"]
}
}
%endif
; Exercises VPMADDWD in its 128-bit and 256-bit forms, each with the second
; source taken once from memory and once from a register.

; rdx -> the 64 bytes of test vectors at .data below.
lea rdx, [rel .data]
; ymm0 = first 32 bytes of .data, ymm1 = second 32 bytes.
vmovaps ymm0, [rdx]
vmovaps ymm1, [rdx + 32]
; 128-bit form. [rdx + 32] holds the same bytes that were loaded into ymm1, so
; the CONFIG header expects xmm2 and xmm3 to be identical, with the upper
; 128 bits of both registers zero.
vpmaddwd xmm2, xmm0, [rdx + 32]
vpmaddwd xmm3, xmm0, xmm1
; 256-bit form. Same memory/register pairing; the CONFIG header expects ymm4
; and ymm5 to be identical.
vpmaddwd ymm4, ymm0, [rdx + 32]
vpmaddwd ymm5, ymm0, ymm1
; NOTE(review): presumably ends the test so the harness can compare registers
; against the CONFIG header -- confirm against the test runner.
hlt
; Keep the data 32-byte aligned for the vmovaps ymm loads above.
align 32
.data:
; First multiplicand: the 32 bytes loaded into ymm0.
dq 0x4142434445464748
dq 0x5152535455565758
dq 0x6666777788889999
dq 0x5555444433332222
; Second multiplicand: the 32 bytes loaded into ymm1 and also read directly
; through [rdx + 32].
dq 0x6162636465666768
dq 0x7172737475767778
dq 0x5555444433332222
dq 0xAAAAAAAAAAAAAAAA