mirror of
https://github.com/FEX-Emu/FEX.git
synced 2025-01-10 15:50:18 +00:00
Merge pull request #2251 from lioncash/hadd
OpcodeDispatcher: Handle VPHADDW/VPHADDD
This commit is contained in:
commit
8ce6c08152
@ -507,26 +507,26 @@ DEF_OP(VAddP) {
|
||||
switch (ElementSize) {
|
||||
case 1: {
|
||||
addp(VTMP1.Z().VnB(), Pred, VTMP1.Z().VnB(), VectorUpper.Z().VnB());
|
||||
uzp1(VTMP2.Z().VnB(), VTMP1.Z().VnB(), VTMP1.Z().VnB());
|
||||
uzp2(VTMP3.Z().VnB(), VTMP1.Z().VnB(), VTMP1.Z().VnB());
|
||||
uzp1(Dst.Z().VnB(), VTMP1.Z().VnB(), VTMP1.Z().VnB());
|
||||
uzp2(VTMP2.Z().VnB(), VTMP1.Z().VnB(), VTMP1.Z().VnB());
|
||||
break;
|
||||
}
|
||||
case 2: {
|
||||
addp(VTMP1.Z().VnH(), Pred, VTMP1.Z().VnH(), VectorUpper.Z().VnH());
|
||||
uzp1(VTMP2.Z().VnH(), VTMP1.Z().VnH(), VTMP1.Z().VnH());
|
||||
uzp2(VTMP3.Z().VnH(), VTMP1.Z().VnH(), VTMP1.Z().VnH());
|
||||
uzp1(Dst.Z().VnH(), VTMP1.Z().VnH(), VTMP1.Z().VnH());
|
||||
uzp2(VTMP2.Z().VnH(), VTMP1.Z().VnH(), VTMP1.Z().VnH());
|
||||
break;
|
||||
}
|
||||
case 4: {
|
||||
addp(VTMP1.Z().VnS(), Pred, VTMP1.Z().VnS(), VectorUpper.Z().VnS());
|
||||
uzp1(VTMP2.Z().VnS(), VTMP1.Z().VnS(), VTMP1.Z().VnS());
|
||||
uzp2(VTMP3.Z().VnS(), VTMP1.Z().VnS(), VTMP1.Z().VnS());
|
||||
uzp1(Dst.Z().VnS(), VTMP1.Z().VnS(), VTMP1.Z().VnS());
|
||||
uzp2(VTMP2.Z().VnS(), VTMP1.Z().VnS(), VTMP1.Z().VnS());
|
||||
break;
|
||||
}
|
||||
case 8: {
|
||||
addp(VTMP1.Z().VnD(), Pred, VTMP1.Z().VnD(), VectorUpper.Z().VnD());
|
||||
uzp1(VTMP2.Z().VnD(), VTMP1.Z().VnD(), VTMP1.Z().VnD());
|
||||
uzp2(VTMP3.Z().VnD(), VTMP1.Z().VnD(), VTMP1.Z().VnD());
|
||||
uzp1(Dst.Z().VnD(), VTMP1.Z().VnD(), VTMP1.Z().VnD());
|
||||
uzp2(VTMP2.Z().VnD(), VTMP1.Z().VnD(), VTMP1.Z().VnD());
|
||||
break;
|
||||
}
|
||||
default:
|
||||
@ -534,13 +534,8 @@ DEF_OP(VAddP) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Shift the entire vector over by 128 bits.
|
||||
mov(TMP1, 0);
|
||||
insr(VTMP3.Z().VnD(), TMP1.X());
|
||||
insr(VTMP3.Z().VnD(), TMP1.X());
|
||||
|
||||
// Now combine the lower and upper halves.
|
||||
orr(Dst.Z().VnD(), VTMP2.Z().VnD(), VTMP3.Z().VnD());
|
||||
// Merge upper half with lower half.
|
||||
splice(Dst.Z().VnD(), PRED_TMP_16B, Dst.Z().VnD(), VTMP2.Z().VnD());
|
||||
} else {
|
||||
if (IsScalar) {
|
||||
switch (ElementSize) {
|
||||
|
@ -433,6 +433,7 @@ DEF_OP(VAddP) {
|
||||
const auto Op = IROp->C<IR::IROp_VAddP>();
|
||||
const auto OpSize = IROp->Size;
|
||||
|
||||
const auto Is256Bit = OpSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
const auto ElementSize = Op->Header.ElementSize;
|
||||
|
||||
const auto Dst = GetDst(Node);
|
||||
@ -478,30 +479,64 @@ DEF_OP(VAddP) {
|
||||
const auto VectorLowerYMM = ToYMM(VectorLower);
|
||||
const auto VectorUpperYMM = ToYMM(VectorUpper);
|
||||
|
||||
// To behave like ADDP, we need to swap the second and third elements around
|
||||
// in the 256-bit case. ADDP operates as if both vectors are concatenated
|
||||
// together and runs down the length of it adding pairs as it goes, whereas
|
||||
// VPHADDW/D operates on both individual halves of the entire register.
|
||||
switch (ElementSize) {
|
||||
case 1:
|
||||
vmovdqu(ymm15, VectorLowerYMM);
|
||||
vmovdqu(ymm14, VectorUpperYMM);
|
||||
if (Is256Bit) {
|
||||
vmovdqu(ymm15, VectorLowerYMM);
|
||||
vmovdqu(ymm14, VectorUpperYMM);
|
||||
|
||||
vpunpcklbw(ymm0, ymm15, ymm14);
|
||||
vpunpckhbw(ymm12, ymm15, ymm14);
|
||||
vpunpcklbw(ymm0, ymm15, ymm14);
|
||||
vpunpckhbw(ymm12, ymm15, ymm14);
|
||||
|
||||
vpunpcklbw(ymm15, ymm0, ymm12);
|
||||
vpunpckhbw(ymm14, ymm0, ymm12);
|
||||
vpunpcklbw(ymm15, ymm0, ymm12);
|
||||
vpunpckhbw(ymm14, ymm0, ymm12);
|
||||
|
||||
vpunpcklbw(ymm0, ymm15, ymm14);
|
||||
vpunpckhbw(ymm12, ymm15, ymm14);
|
||||
vpunpcklbw(ymm0, ymm15, ymm14);
|
||||
vpunpckhbw(ymm12, ymm15, ymm14);
|
||||
|
||||
vpunpcklbw(ymm15, ymm0, ymm12);
|
||||
vpunpckhbw(ymm14, ymm0, ymm12);
|
||||
vpunpcklbw(ymm15, ymm0, ymm12);
|
||||
vpunpckhbw(ymm14, ymm0, ymm12);
|
||||
|
||||
vpaddb(DstYMM, ymm15, ymm14);
|
||||
vpaddb(DstYMM, ymm15, ymm14);
|
||||
vpermq(DstYMM, DstYMM, 0b11'01'10'00);
|
||||
} else {
|
||||
vmovdqu(xmm15, VectorLower);
|
||||
vmovdqu(xmm14, VectorUpper);
|
||||
|
||||
vpunpcklbw(xmm0, xmm15, xmm14);
|
||||
vpunpckhbw(xmm12, xmm15, xmm14);
|
||||
|
||||
vpunpcklbw(xmm15, xmm0, xmm12);
|
||||
vpunpckhbw(xmm14, xmm0, xmm12);
|
||||
|
||||
vpunpcklbw(xmm0, xmm15, xmm14);
|
||||
vpunpckhbw(xmm12, xmm15, xmm14);
|
||||
|
||||
vpunpcklbw(xmm15, xmm0, xmm12);
|
||||
vpunpckhbw(xmm14, xmm0, xmm12);
|
||||
|
||||
vpaddb(Dst, xmm15, xmm14);
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
vphaddw(DstYMM, VectorLowerYMM, VectorUpperYMM);
|
||||
if (Is256Bit) {
|
||||
vphaddw(DstYMM, VectorLowerYMM, VectorUpperYMM);
|
||||
vpermq(DstYMM, DstYMM, 0b11'01'10'00);
|
||||
} else {
|
||||
vphaddw(Dst, VectorLower, VectorUpper);
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
vphaddd(DstYMM, VectorLowerYMM, VectorUpperYMM);
|
||||
if (Is256Bit) {
|
||||
vphaddd(DstYMM, VectorLowerYMM, VectorUpperYMM);
|
||||
vpermq(DstYMM, DstYMM, 0b11'01'10'00);
|
||||
} else {
|
||||
vphaddd(Dst, VectorLower, VectorUpper);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
LOGMAN_MSG_A_FMT("Unknown Element Size: {}", ElementSize);
|
||||
|
@ -5903,8 +5903,8 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() {
|
||||
|
||||
{OPD(1, 0b00, 0x77), 1, &OpDispatchBuilder::VZEROOp},
|
||||
|
||||
{OPD(1, 0b01, 0x7C), 1, &OpDispatchBuilder::VHADDPOp<8>},
|
||||
{OPD(1, 0b11, 0x7C), 1, &OpDispatchBuilder::VHADDPOp<4>},
|
||||
{OPD(1, 0b01, 0x7C), 1, &OpDispatchBuilder::VHADDPOp<IR::OP_VFADDP, 8>},
|
||||
{OPD(1, 0b11, 0x7C), 1, &OpDispatchBuilder::VHADDPOp<IR::OP_VFADDP, 4>},
|
||||
|
||||
{OPD(1, 0b01, 0x7E), 1, &OpDispatchBuilder::MOVBetweenGPR_FPR},
|
||||
{OPD(1, 0b10, 0x7E), 1, &OpDispatchBuilder::MOVQOp},
|
||||
@ -5948,6 +5948,9 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() {
|
||||
{OPD(1, 0b01, 0xFD), 1, &OpDispatchBuilder::AVXVectorALUOp<IR::OP_VADD, 2>},
|
||||
{OPD(1, 0b01, 0xFE), 1, &OpDispatchBuilder::AVXVectorALUOp<IR::OP_VADD, 4>},
|
||||
|
||||
{OPD(2, 0b01, 0x01), 1, &OpDispatchBuilder::VHADDPOp<IR::OP_VADDP, 2>},
|
||||
{OPD(2, 0b01, 0x02), 1, &OpDispatchBuilder::VHADDPOp<IR::OP_VADDP, 4>},
|
||||
|
||||
{OPD(2, 0b01, 0x08), 1, &OpDispatchBuilder::VPSIGN<1>},
|
||||
{OPD(2, 0b01, 0x09), 1, &OpDispatchBuilder::VPSIGN<2>},
|
||||
{OPD(2, 0b01, 0x0A), 1, &OpDispatchBuilder::VPSIGN<4>},
|
||||
|
@ -421,7 +421,7 @@ public:
|
||||
template <size_t ElementSize>
|
||||
void VBROADCASTOp(OpcodeArgs);
|
||||
|
||||
template <size_t ElementSize>
|
||||
template <IROps IROp, size_t ElementSize>
|
||||
void VHADDPOp(OpcodeArgs);
|
||||
|
||||
void VINSERTOp(OpcodeArgs);
|
||||
|
@ -1019,7 +1019,7 @@ void OpDispatchBuilder::VANDNOp(OpcodeArgs) {
|
||||
StoreResult(FPRClass, Op, Dest, -1);
|
||||
}
|
||||
|
||||
template <size_t ElementSize>
|
||||
template <IROps IROp, size_t ElementSize>
|
||||
void OpDispatchBuilder::VHADDPOp(OpcodeArgs) {
|
||||
const auto SrcSize = GetSrcSize(Op);
|
||||
const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE;
|
||||
@ -1027,7 +1027,9 @@ void OpDispatchBuilder::VHADDPOp(OpcodeArgs) {
|
||||
OrderedNode *Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags, -1);
|
||||
OrderedNode *Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags, -1);
|
||||
|
||||
OrderedNode *Res = _VFAddP(SrcSize, ElementSize, Src1, Src2);
|
||||
auto Res = _VFAddP(SrcSize, ElementSize, Src1, Src2);
|
||||
Res.first->Header.Op = IROp;
|
||||
|
||||
OrderedNode *Dest{};
|
||||
|
||||
if (Is256Bit) {
|
||||
@ -1041,9 +1043,13 @@ void OpDispatchBuilder::VHADDPOp(OpcodeArgs) {
|
||||
}
|
||||
|
||||
template
|
||||
void OpDispatchBuilder::VHADDPOp<4>(OpcodeArgs);
|
||||
void OpDispatchBuilder::VHADDPOp<IR::OP_VADDP, 2>(OpcodeArgs);
|
||||
template
|
||||
void OpDispatchBuilder::VHADDPOp<8>(OpcodeArgs);
|
||||
void OpDispatchBuilder::VHADDPOp<IR::OP_VADDP, 4>(OpcodeArgs);
|
||||
template
|
||||
void OpDispatchBuilder::VHADDPOp<IR::OP_VFADDP, 4>(OpcodeArgs);
|
||||
template
|
||||
void OpDispatchBuilder::VHADDPOp<IR::OP_VFADDP, 8>(OpcodeArgs);
|
||||
|
||||
template <size_t ElementSize>
|
||||
void OpDispatchBuilder::VBROADCASTOp(OpcodeArgs) {
|
||||
|
@ -260,9 +260,9 @@ void InitializeVEXTables() {
|
||||
{OPD(1, 0b01, 0xFE), 1, X86InstInfo{"VPADDD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}},
|
||||
|
||||
// VEX Map 2
|
||||
{OPD(2, 0b01, 0x00), 1, X86InstInfo{"VPSHUFB", TYPE_INST, FLAGS_MODRM | FLAGS_XMM_FLAGS, 0, nullptr}},
|
||||
{OPD(2, 0b01, 0x01), 1, X86InstInfo{"VPADDW", TYPE_UNDEC, FLAGS_NONE, 0, nullptr}},
|
||||
{OPD(2, 0b01, 0x02), 1, X86InstInfo{"VPHADDD", TYPE_UNDEC, FLAGS_NONE, 0, nullptr}},
|
||||
{OPD(2, 0b01, 0x00), 1, X86InstInfo{"VPSHUFB", TYPE_UNDEC, FLAGS_MODRM | FLAGS_XMM_FLAGS, 0, nullptr}},
|
||||
{OPD(2, 0b01, 0x01), 1, X86InstInfo{"VPHADDW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}},
|
||||
{OPD(2, 0b01, 0x02), 1, X86InstInfo{"VPHADDD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0, nullptr}},
|
||||
{OPD(2, 0b01, 0x03), 1, X86InstInfo{"VPHADDSW", TYPE_UNDEC, FLAGS_NONE, 0, nullptr}},
|
||||
{OPD(2, 0b01, 0x04), 1, X86InstInfo{"VPMADDUBSW", TYPE_UNDEC, FLAGS_NONE, 0, nullptr}},
|
||||
{OPD(2, 0b01, 0x05), 1, X86InstInfo{"VPHSUBW", TYPE_UNDEC, FLAGS_NONE, 0, nullptr}},
|
||||
|
39
unittests/ASM/VEX/vphaddd.asm
Normal file
39
unittests/ASM/VEX/vphaddd.asm
Normal file
@ -0,0 +1,39 @@
|
||||
%ifdef CONFIG
|
||||
{
|
||||
"HostFeatures": ["AVX"],
|
||||
"RegData": {
|
||||
"XMM0": ["0x4142434445464748", "0x5152535455565758", "0x4142434445464748", "0x5152535455565758"],
|
||||
"XMM1": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x7172737475767778"],
|
||||
"XMM2": ["0xA6A8AAAC86888A8C", "0xE6E8EAECC6C8CACC", "0x0000000000000000", "0x0000000000000000"],
|
||||
"XMM3": ["0xE6E8EAECC6C8CACC", "0xA6A8AAAC86888A8C", "0x0000000000000000", "0x0000000000000000"],
|
||||
"XMM4": ["0xA6A8AAAC86888A8C", "0xE6E8EAECC6C8CACC", "0xA6A8AAAC86888A8C", "0xE6E8EAECC6C8CACC"],
|
||||
"XMM5": ["0xE6E8EAECC6C8CACC", "0xA6A8AAAC86888A8C", "0xE6E8EAECC6C8CACC", "0xA6A8AAAC86888A8C"]
|
||||
}
|
||||
}
|
||||
%endif
|
||||
|
||||
lea rdx, [rel .data]
|
||||
|
||||
vmovaps ymm0, [rdx]
|
||||
vmovaps ymm1, [rdx + 32]
|
||||
|
||||
; Memory Operands
|
||||
vphaddd xmm2, xmm0, [rdx + 32]
|
||||
vphaddd xmm3, xmm1, [rdx]
|
||||
|
||||
vphaddd ymm4, ymm0, [rdx + 32]
|
||||
vphaddd ymm5, ymm1, [rdx]
|
||||
|
||||
hlt
|
||||
|
||||
align 32
|
||||
.data:
|
||||
dq 0x4142434445464748
|
||||
dq 0x5152535455565758
|
||||
dq 0x4142434445464748
|
||||
dq 0x5152535455565758
|
||||
|
||||
dq 0x6162636465666768
|
||||
dq 0x7172737475767778
|
||||
dq 0x6162636465666768
|
||||
dq 0x7172737475767778
|
39
unittests/ASM/VEX/vphaddw.asm
Normal file
39
unittests/ASM/VEX/vphaddw.asm
Normal file
@ -0,0 +1,39 @@
|
||||
%ifdef CONFIG
|
||||
{
|
||||
"HostFeatures": ["AVX"],
|
||||
"RegData": {
|
||||
"XMM0": ["0x4142434445464748", "0x5152535455565758", "0x4142434445464748", "0x5152535455565758"],
|
||||
"XMM1": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x7172737475767778"],
|
||||
"XMM2": ["0xA4A6ACAE84868C8E", "0xE4E6ECEEC4C6CCCE", "0x0000000000000000", "0x0000000000000000"],
|
||||
"XMM3": ["0xE4E6ECEEC4C6CCCE", "0xA4A6ACAE84868C8E", "0x0000000000000000", "0x0000000000000000"],
|
||||
"XMM4": ["0xA4A6ACAE84868C8E", "0xE4E6ECEEC4C6CCCE", "0xA4A6ACAE84868C8E", "0xE4E6ECEEC4C6CCCE"],
|
||||
"XMM5": ["0xE4E6ECEEC4C6CCCE", "0xA4A6ACAE84868C8E", "0xE4E6ECEEC4C6CCCE", "0xA4A6ACAE84868C8E"]
|
||||
}
|
||||
}
|
||||
%endif
|
||||
|
||||
lea rdx, [rel .data]
|
||||
|
||||
vmovaps ymm0, [rdx]
|
||||
vmovaps ymm1, [rdx + 32]
|
||||
|
||||
; Memory Operands
|
||||
vphaddw xmm2, xmm0, [rdx + 32]
|
||||
vphaddw xmm3, xmm1, [rdx]
|
||||
|
||||
vphaddw ymm4, ymm0, [rdx + 32]
|
||||
vphaddw ymm5, ymm1, [rdx]
|
||||
|
||||
hlt
|
||||
|
||||
align 32
|
||||
.data:
|
||||
dq 0x4142434445464748
|
||||
dq 0x5152535455565758
|
||||
dq 0x4142434445464748
|
||||
dq 0x5152535455565758
|
||||
|
||||
dq 0x6162636465666768
|
||||
dq 0x7172737475767778
|
||||
dq 0x6162636465666768
|
||||
dq 0x7172737475767778
|
Loading…
Reference in New Issue
Block a user