Merge pull request #2251 from lioncash/hadd

OpcodeDispatcher: Handle VPHADDW/VPHADDD
This commit is contained in:
Ryan Houdek 2022-12-15 20:11:07 -08:00 committed by GitHub
commit 8ce6c08152
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 155 additions and 38 deletions

View File

@ -507,26 +507,26 @@ DEF_OP(VAddP) {
switch (ElementSize) {
case 1: {
addp(VTMP1.Z().VnB(), Pred, VTMP1.Z().VnB(), VectorUpper.Z().VnB());
uzp1(VTMP2.Z().VnB(), VTMP1.Z().VnB(), VTMP1.Z().VnB());
uzp2(VTMP3.Z().VnB(), VTMP1.Z().VnB(), VTMP1.Z().VnB());
uzp1(Dst.Z().VnB(), VTMP1.Z().VnB(), VTMP1.Z().VnB());
uzp2(VTMP2.Z().VnB(), VTMP1.Z().VnB(), VTMP1.Z().VnB());
break;
}
case 2: {
addp(VTMP1.Z().VnH(), Pred, VTMP1.Z().VnH(), VectorUpper.Z().VnH());
uzp1(VTMP2.Z().VnH(), VTMP1.Z().VnH(), VTMP1.Z().VnH());
uzp2(VTMP3.Z().VnH(), VTMP1.Z().VnH(), VTMP1.Z().VnH());
uzp1(Dst.Z().VnH(), VTMP1.Z().VnH(), VTMP1.Z().VnH());
uzp2(VTMP2.Z().VnH(), VTMP1.Z().VnH(), VTMP1.Z().VnH());
break;
}
case 4: {
addp(VTMP1.Z().VnS(), Pred, VTMP1.Z().VnS(), VectorUpper.Z().VnS());
uzp1(VTMP2.Z().VnS(), VTMP1.Z().VnS(), VTMP1.Z().VnS());
uzp2(VTMP3.Z().VnS(), VTMP1.Z().VnS(), VTMP1.Z().VnS());
uzp1(Dst.Z().VnS(), VTMP1.Z().VnS(), VTMP1.Z().VnS());
uzp2(VTMP2.Z().VnS(), VTMP1.Z().VnS(), VTMP1.Z().VnS());
break;
}
case 8: {
addp(VTMP1.Z().VnD(), Pred, VTMP1.Z().VnD(), VectorUpper.Z().VnD());
uzp1(VTMP2.Z().VnD(), VTMP1.Z().VnD(), VTMP1.Z().VnD());
uzp2(VTMP3.Z().VnD(), VTMP1.Z().VnD(), VTMP1.Z().VnD());
uzp1(Dst.Z().VnD(), VTMP1.Z().VnD(), VTMP1.Z().VnD());
uzp2(VTMP2.Z().VnD(), VTMP1.Z().VnD(), VTMP1.Z().VnD());
break;
}
default:
@ -534,13 +534,8 @@ DEF_OP(VAddP) {
return;
}
// Shift the entire vector over by 128 bits.
mov(TMP1, 0);
insr(VTMP3.Z().VnD(), TMP1.X());
insr(VTMP3.Z().VnD(), TMP1.X());
// Now combine the lower and upper halves.
orr(Dst.Z().VnD(), VTMP2.Z().VnD(), VTMP3.Z().VnD());
// Merge upper half with lower half.
splice(Dst.Z().VnD(), PRED_TMP_16B, Dst.Z().VnD(), VTMP2.Z().VnD());
} else {
if (IsScalar) {
switch (ElementSize) {

View File

@ -433,6 +433,7 @@ DEF_OP(VAddP) {
const auto Op = IROp->C<IR::IROp_VAddP>();
const auto OpSize = IROp->Size;
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
const auto ElementSize = Op->Header.ElementSize;
const auto Dst = GetDst(Node);
@ -478,30 +479,64 @@ DEF_OP(VAddP) {
const auto VectorLowerYMM = ToYMM(VectorLower);
const auto VectorUpperYMM = ToYMM(VectorUpper);
// To behave like ADDP in the 256-bit case, we need to swap the second and
// third 64-bit quadwords of the result. ADDP operates as if both vectors are
// concatenated together and runs down the length of it adding pairs as it
// goes, whereas VPHADDW/VPHADDD operate on each 128-bit half of the register
// independently.
switch (ElementSize) {
case 1:
vmovdqu(ymm15, VectorLowerYMM);
vmovdqu(ymm14, VectorUpperYMM);
if (Is256Bit) {
vmovdqu(ymm15, VectorLowerYMM);
vmovdqu(ymm14, VectorUpperYMM);
vpunpcklbw(ymm0, ymm15, ymm14);
vpunpckhbw(ymm12, ymm15, ymm14);
vpunpcklbw(ymm0, ymm15, ymm14);
vpunpckhbw(ymm12, ymm15, ymm14);
vpunpcklbw(ymm15, ymm0, ymm12);
vpunpckhbw(ymm14, ymm0, ymm12);
vpunpcklbw(ymm15, ymm0, ymm12);
vpunpckhbw(ymm14, ymm0, ymm12);
vpunpcklbw(ymm0, ymm15, ymm14);
vpunpckhbw(ymm12, ymm15, ymm14);
vpunpcklbw(ymm0, ymm15, ymm14);
vpunpckhbw(ymm12, ymm15, ymm14);
vpunpcklbw(ymm15, ymm0, ymm12);
vpunpckhbw(ymm14, ymm0, ymm12);
vpunpcklbw(ymm15, ymm0, ymm12);
vpunpckhbw(ymm14, ymm0, ymm12);
vpaddb(DstYMM, ymm15, ymm14);
vpaddb(DstYMM, ymm15, ymm14);
vpermq(DstYMM, DstYMM, 0b11'01'10'00);
} else {
vmovdqu(xmm15, VectorLower);
vmovdqu(xmm14, VectorUpper);
vpunpcklbw(xmm0, xmm15, xmm14);
vpunpckhbw(xmm12, xmm15, xmm14);
vpunpcklbw(xmm15, xmm0, xmm12);
vpunpckhbw(xmm14, xmm0, xmm12);
vpunpcklbw(xmm0, xmm15, xmm14);
vpunpckhbw(xmm12, xmm15, xmm14);
vpunpcklbw(xmm15, xmm0, xmm12);
vpunpckhbw(xmm14, xmm0, xmm12);
vpaddb(Dst, xmm15, xmm14);
}
break;
case 2:
vphaddw(DstYMM, VectorLowerYMM, VectorUpperYMM);
if (Is256Bit) {
vphaddw(DstYMM, VectorLowerYMM, VectorUpperYMM);
vpermq(DstYMM, DstYMM, 0b11'01'10'00);
} else {
vphaddw(Dst, VectorLower, VectorUpper);
}
break;
case 4:
vphaddd(DstYMM, VectorLowerYMM, VectorUpperYMM);
if (Is256Bit) {
vphaddd(DstYMM, VectorLowerYMM, VectorUpperYMM);
vpermq(DstYMM, DstYMM, 0b11'01'10'00);
} else {
vphaddd(Dst, VectorLower, VectorUpper);
}
break;
default:
LOGMAN_MSG_A_FMT("Unknown Element Size: {}", ElementSize);

View File

@ -5903,8 +5903,8 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() {
{OPD(1, 0b00, 0x77), 1, &OpDispatchBuilder::VZEROOp},
{OPD(1, 0b01, 0x7C), 1, &OpDispatchBuilder::VHADDPOp<8>},
{OPD(1, 0b11, 0x7C), 1, &OpDispatchBuilder::VHADDPOp<4>},
{OPD(1, 0b01, 0x7C), 1, &OpDispatchBuilder::VHADDPOp<IR::OP_VFADDP, 8>},
{OPD(1, 0b11, 0x7C), 1, &OpDispatchBuilder::VHADDPOp<IR::OP_VFADDP, 4>},
{OPD(1, 0b01, 0x7E), 1, &OpDispatchBuilder::MOVBetweenGPR_FPR},
{OPD(1, 0b10, 0x7E), 1, &OpDispatchBuilder::MOVQOp},
@ -5948,6 +5948,9 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() {
{OPD(1, 0b01, 0xFD), 1, &OpDispatchBuilder::AVXVectorALUOp<IR::OP_VADD, 2>},
{OPD(1, 0b01, 0xFE), 1, &OpDispatchBuilder::AVXVectorALUOp<IR::OP_VADD, 4>},
{OPD(2, 0b01, 0x01), 1, &OpDispatchBuilder::VHADDPOp<IR::OP_VADDP, 2>},
{OPD(2, 0b01, 0x02), 1, &OpDispatchBuilder::VHADDPOp<IR::OP_VADDP, 4>},
{OPD(2, 0b01, 0x08), 1, &OpDispatchBuilder::VPSIGN<1>},
{OPD(2, 0b01, 0x09), 1, &OpDispatchBuilder::VPSIGN<2>},
{OPD(2, 0b01, 0x0A), 1, &OpDispatchBuilder::VPSIGN<4>},

View File

@ -421,7 +421,7 @@ public:
template <size_t ElementSize>
void VBROADCASTOp(OpcodeArgs);
template <size_t ElementSize>
template <IROps IROp, size_t ElementSize>
void VHADDPOp(OpcodeArgs);
void VINSERTOp(OpcodeArgs);

View File

@ -1019,7 +1019,7 @@ void OpDispatchBuilder::VANDNOp(OpcodeArgs) {
StoreResult(FPRClass, Op, Dest, -1);
}
template <size_t ElementSize>
template <IROps IROp, size_t ElementSize>
void OpDispatchBuilder::VHADDPOp(OpcodeArgs) {
const auto SrcSize = GetSrcSize(Op);
const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE;
@ -1027,7 +1027,9 @@ void OpDispatchBuilder::VHADDPOp(OpcodeArgs) {
OrderedNode *Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags, -1);
OrderedNode *Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags, -1);
OrderedNode *Res = _VFAddP(SrcSize, ElementSize, Src1, Src2);
auto Res = _VFAddP(SrcSize, ElementSize, Src1, Src2);
Res.first->Header.Op = IROp;
OrderedNode *Dest{};
if (Is256Bit) {
@ -1041,9 +1043,13 @@ void OpDispatchBuilder::VHADDPOp(OpcodeArgs) {
}
template
void OpDispatchBuilder::VHADDPOp<4>(OpcodeArgs);
void OpDispatchBuilder::VHADDPOp<IR::OP_VADDP, 2>(OpcodeArgs);
template
void OpDispatchBuilder::VHADDPOp<8>(OpcodeArgs);
void OpDispatchBuilder::VHADDPOp<IR::OP_VADDP, 4>(OpcodeArgs);
template
void OpDispatchBuilder::VHADDPOp<IR::OP_VFADDP, 4>(OpcodeArgs);
template
void OpDispatchBuilder::VHADDPOp<IR::OP_VFADDP, 8>(OpcodeArgs);
template <size_t ElementSize>
void OpDispatchBuilder::VBROADCASTOp(OpcodeArgs) {

View File

@ -260,9 +260,9 @@ void InitializeVEXTables() {
{OPD(1, 0b01, 0xFE), 1, X86InstInfo{"VPADDD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}},
// VEX Map 2
{OPD(2, 0b01, 0x00), 1, X86InstInfo{"VPSHUFB", TYPE_INST, FLAGS_MODRM | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x01), 1, X86InstInfo{"VPADDW", TYPE_UNDEC, FLAGS_NONE, 0, nullptr}},
{OPD(2, 0b01, 0x02), 1, X86InstInfo{"VPHADDD", TYPE_UNDEC, FLAGS_NONE, 0, nullptr}},
{OPD(2, 0b01, 0x00), 1, X86InstInfo{"VPSHUFB", TYPE_UNDEC, FLAGS_MODRM | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x01), 1, X86InstInfo{"VPHADDW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x02), 1, X86InstInfo{"VPHADDD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}},
{OPD(2, 0b01, 0x03), 1, X86InstInfo{"VPHADDSW", TYPE_UNDEC, FLAGS_NONE, 0, nullptr}},
{OPD(2, 0b01, 0x04), 1, X86InstInfo{"VPMADDUBSW", TYPE_UNDEC, FLAGS_NONE, 0, nullptr}},
{OPD(2, 0b01, 0x05), 1, X86InstInfo{"VPHSUBW", TYPE_UNDEC, FLAGS_NONE, 0, nullptr}},

View File

@ -0,0 +1,39 @@
%ifdef CONFIG
{
"HostFeatures": ["AVX"],
"RegData": {
"XMM0": ["0x4142434445464748", "0x5152535455565758", "0x4142434445464748", "0x5152535455565758"],
"XMM1": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x7172737475767778"],
"XMM2": ["0xA6A8AAAC86888A8C", "0xE6E8EAECC6C8CACC", "0x0000000000000000", "0x0000000000000000"],
"XMM3": ["0xE6E8EAECC6C8CACC", "0xA6A8AAAC86888A8C", "0x0000000000000000", "0x0000000000000000"],
"XMM4": ["0xA6A8AAAC86888A8C", "0xE6E8EAECC6C8CACC", "0xA6A8AAAC86888A8C", "0xE6E8EAECC6C8CACC"],
"XMM5": ["0xE6E8EAECC6C8CACC", "0xA6A8AAAC86888A8C", "0xE6E8EAECC6C8CACC", "0xA6A8AAAC86888A8C"]
}
}
%endif
; Test body for VPHADDD (horizontal add of adjacent 32-bit integer pairs).
; Covers the 128-bit (xmm) and 256-bit (ymm) encodings, each with the second
; source operand read from memory. Expected register contents are listed in
; the CONFIG header's RegData.

; rdx = address of the 64 bytes of test data at .data.
lea rdx, [rel .data]
; ymm0 = first 32 bytes of the data, ymm1 = second 32 bytes.
vmovaps ymm0, [rdx]
vmovaps ymm1, [rdx + 32]
; Memory Operands
; 128-bit forms, run with the two inputs in both operand orders. RegData
; expects the upper 128 bits of XMM2/XMM3 to be zero.
vphaddd xmm2, xmm0, [rdx + 32]
vphaddd xmm3, xmm1, [rdx]
; 256-bit forms, again with the two inputs in both operand orders.
vphaddd ymm4, ymm0, [rdx + 32]
vphaddd ymm5, ymm1, [rdx]
; NOTE(review): hlt presumably marks the end of the test for the harness,
; after which registers are compared against RegData -- confirm.
hlt
; vmovaps with a ymm register requires a 32-byte aligned memory operand.
align 32
.data:
; Bytes 0-31: loaded into ymm0 (the same 16-byte pattern repeated twice).
dq 0x4142434445464748
dq 0x5152535455565758
dq 0x4142434445464748
dq 0x5152535455565758
; Bytes 32-63: loaded into ymm1 (the same 16-byte pattern repeated twice).
dq 0x6162636465666768
dq 0x7172737475767778
dq 0x6162636465666768
dq 0x7172737475767778

View File

@ -0,0 +1,39 @@
%ifdef CONFIG
{
"HostFeatures": ["AVX"],
"RegData": {
"XMM0": ["0x4142434445464748", "0x5152535455565758", "0x4142434445464748", "0x5152535455565758"],
"XMM1": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x7172737475767778"],
"XMM2": ["0xA4A6ACAE84868C8E", "0xE4E6ECEEC4C6CCCE", "0x0000000000000000", "0x0000000000000000"],
"XMM3": ["0xE4E6ECEEC4C6CCCE", "0xA4A6ACAE84868C8E", "0x0000000000000000", "0x0000000000000000"],
"XMM4": ["0xA4A6ACAE84868C8E", "0xE4E6ECEEC4C6CCCE", "0xA4A6ACAE84868C8E", "0xE4E6ECEEC4C6CCCE"],
"XMM5": ["0xE4E6ECEEC4C6CCCE", "0xA4A6ACAE84868C8E", "0xE4E6ECEEC4C6CCCE", "0xA4A6ACAE84868C8E"]
}
}
%endif
; Test body for VPHADDW (horizontal add of adjacent 16-bit integer pairs).
; Covers the 128-bit (xmm) and 256-bit (ymm) encodings, each with the second
; source operand read from memory. Expected register contents are listed in
; the CONFIG header's RegData.

; rdx = address of the 64 bytes of test data at .data.
lea rdx, [rel .data]
; ymm0 = first 32 bytes of the data, ymm1 = second 32 bytes.
vmovaps ymm0, [rdx]
vmovaps ymm1, [rdx + 32]
; Memory Operands
; 128-bit forms, run with the two inputs in both operand orders. RegData
; expects the upper 128 bits of XMM2/XMM3 to be zero.
vphaddw xmm2, xmm0, [rdx + 32]
vphaddw xmm3, xmm1, [rdx]
; 256-bit forms, again with the two inputs in both operand orders.
vphaddw ymm4, ymm0, [rdx + 32]
vphaddw ymm5, ymm1, [rdx]
; NOTE(review): hlt presumably marks the end of the test for the harness,
; after which registers are compared against RegData -- confirm.
hlt
; vmovaps with a ymm register requires a 32-byte aligned memory operand.
align 32
.data:
; Bytes 0-31: loaded into ymm0 (the same 16-byte pattern repeated twice).
dq 0x4142434445464748
dq 0x5152535455565758
dq 0x4142434445464748
dq 0x5152535455565758
; Bytes 32-63: loaded into ymm1 (the same 16-byte pattern repeated twice).
dq 0x6162636465666768
dq 0x7172737475767778
dq 0x6162636465666768
dq 0x7172737475767778