Merge pull request #3730 from Sonicadvance1/avx_4

Vector: Helper refactorings
Ryan Houdek 2024-06-21 00:31:14 -07:00 committed by GitHub
commit 3f232e631e
2 changed files with 139 additions and 108 deletions


@@ -609,6 +609,7 @@ public:
void VANDNOp(OpcodeArgs);
Ref VBLENDOpImpl(uint32_t VecSize, uint32_t ElementSize, Ref Src1, Ref Src2, Ref ZeroRegister, uint64_t Selector);
void VBLENDPDOp(OpcodeArgs);
void VPBLENDDOp(OpcodeArgs);
void VPBLENDWOp(OpcodeArgs);
@@ -1112,8 +1113,7 @@ private:
Ref InsertPSOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2,
const X86Tables::DecodedOperand& Imm);
Ref MPSADBWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op,
const X86Tables::DecodedOperand& ImmOp);
Ref MPSADBWOpImpl(size_t SrcSize, Ref Src1, Ref Src2, uint8_t Select);
Ref PALIGNROpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2,
const X86Tables::DecodedOperand& Imm, bool IsAVX);
@@ -1131,9 +1131,9 @@ private:
Ref PINSROpImpl(OpcodeArgs, size_t ElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op,
const X86Tables::DecodedOperand& Imm);
Ref PMADDWDOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2);
Ref PMADDWDOpImpl(size_t Size, Ref Src1, Ref Src2);
Ref PMADDUBSWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op);
Ref PMADDUBSWOpImpl(size_t Size, Ref Src1, Ref Src2);
Ref PMULHRSWOpImpl(OpcodeArgs, Ref Src1, Ref Src2);
@@ -1141,9 +1141,9 @@ private:
Ref PMULLOpImpl(OpcodeArgs, size_t ElementSize, bool Signed, Ref Src1, Ref Src2);
Ref PSADBWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op);
Ref PSADBWOpImpl(size_t Size, Ref Src1, Ref Src2);
Ref PSHUFBOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2);
Ref PSHUFBOpImpl(uint8_t SrcSize, Ref Src1, Ref Src2);
Ref PSIGNImpl(OpcodeArgs, size_t ElementSize, Ref Src1, Ref Src2);
@@ -1155,8 +1155,7 @@ private:
Ref PSRLDOpImpl(OpcodeArgs, size_t ElementSize, Ref Src, Ref ShiftVec);
Ref SHUFOpImpl(OpcodeArgs, size_t ElementSize, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2,
const X86Tables::DecodedOperand& Imm);
Ref SHUFOpImpl(OpcodeArgs, size_t DstSize, size_t ElementSize, Ref Src1, Ref Src2, uint8_t Shuffle);
void VMASKMOVOpImpl(OpcodeArgs, size_t ElementSize, size_t DataSize, bool IsStore, const X86Tables::DecodedOperand& MaskOp,
const X86Tables::DecodedOperand& DataOp);


@@ -1004,11 +1004,7 @@ template void OpDispatchBuilder::VPUNPCKHOp<2>(OpcodeArgs);
template void OpDispatchBuilder::VPUNPCKHOp<4>(OpcodeArgs);
template void OpDispatchBuilder::VPUNPCKHOp<8>(OpcodeArgs);
Ref OpDispatchBuilder::PSHUFBOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2) {
Ref Src1Node = LoadSource(FPRClass, Op, Src1, Op->Flags);
Ref Src2Node = LoadSource(FPRClass, Op, Src2, Op->Flags);
const auto SrcSize = GetSrcSize(Op);
Ref OpDispatchBuilder::PSHUFBOpImpl(uint8_t SrcSize, Ref Src1, Ref Src2) {
const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE;
// We perform the 256-bit version as two 128-bit operations due to
@@ -1026,25 +1022,33 @@ Ref OpDispatchBuilder::PSHUFBOpImpl(OpcodeArgs, const X86Tables::DecodedOperand&
const uint8_t MaskImm = SrcSize == 8 ? 0b1000'0111 : 0b1000'1111;
Ref MaskVector = _VectorImm(SrcSize, 1, MaskImm);
Ref MaskedIndices = _VAnd(SrcSize, SrcSize, Src2Node, MaskVector);
Ref MaskedIndices = _VAnd(SrcSize, SrcSize, Src2, MaskVector);
Ref Low = _VTBL1(SanitizedSrcSize, Src1Node, MaskedIndices);
Ref Low = _VTBL1(SanitizedSrcSize, Src1, MaskedIndices);
if (!Is256Bit) {
return Low;
}
Ref HighSrc1 = _VInsElement(SrcSize, 16, 0, 1, Src1Node, Src1Node);
Ref HighSrc1 = _VInsElement(SrcSize, 16, 0, 1, Src1, Src1);
Ref High = _VTBL1(SanitizedSrcSize, HighSrc1, MaskedIndices);
return _VInsElement(SrcSize, 16, 1, 0, Low, High);
}
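For reference, the x86 PSHUFB behaviour this lowering has to reproduce can be modelled per byte: if bit 7 of an index byte is set the result byte becomes zero, otherwise the low index bits select a byte from the source (modulo 8 for the 64-bit MMX form, modulo 16 for XMM). On AArch64, TBL already returns zero for out-of-range indices, which is why masking the indices with 0b1000'0111 / 0b1000'1111 is sufficient. A minimal scalar sketch of the 128-bit case (illustrative code, not FEX's):

#include <array>
#include <cstddef>
#include <cstdint>

// Scalar model of 128-bit PSHUFB. Keeping bit 7 plus the low 4 bits of each
// index (the 0b1000'1111 mask) is exactly the information this model needs.
std::array<uint8_t, 16> PshufbRef(const std::array<uint8_t, 16>& Src, const std::array<uint8_t, 16>& Indices) {
  std::array<uint8_t, 16> Result {};
  for (size_t i = 0; i < 16; ++i) {
    const uint8_t Idx = Indices[i];
    Result[i] = (Idx & 0x80) ? 0 : Src[Idx & 0x0F];
  }
  return Result;
}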
void OpDispatchBuilder::PSHUFBOp(OpcodeArgs) {
Ref Result = PSHUFBOpImpl(Op, Op->Dest, Op->Src[0]);
const auto SrcSize = GetSrcSize(Op);
Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Result = PSHUFBOpImpl(SrcSize, Src1, Src2);
StoreResult(FPRClass, Op, Result, -1);
}
void OpDispatchBuilder::VPSHUFBOp(OpcodeArgs) {
Ref Result = PSHUFBOpImpl(Op, Op->Src[0], Op->Src[1]);
const auto SrcSize = GetSrcSize(Op);
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
Ref Result = PSHUFBOpImpl(SrcSize, Src1, Src2);
StoreResult(FPRClass, Op, Result, -1);
}
@@ -1259,32 +1263,24 @@ template void OpDispatchBuilder::VPSHUFWOp<2, false>(OpcodeArgs);
template void OpDispatchBuilder::VPSHUFWOp<2, true>(OpcodeArgs);
template void OpDispatchBuilder::VPSHUFWOp<4, true>(OpcodeArgs);
Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, size_t ElementSize, const X86Tables::DecodedOperand& Src1,
const X86Tables::DecodedOperand& Src2, const X86Tables::DecodedOperand& Imm) {
Ref Src1Node = LoadSource(FPRClass, Op, Src1, Op->Flags);
Ref Src2Node = LoadSource(FPRClass, Op, Src2, Op->Flags);
LOGMAN_THROW_A_FMT(Imm.IsLiteral(), "Imm needs to be a literal");
uint8_t Shuffle = Imm.Data.Literal.Value;
Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, size_t DstSize, size_t ElementSize, Ref Src1, Ref Src2, uint8_t Shuffle) {
// Since 256-bit variants and up don't lane cross, we can construct
// everything in terms of the 128-variant, as each lane is essentially
// its own 128-bit segment.
const uint8_t NumElements = Core::CPUState::XMM_SSE_REG_SIZE / ElementSize;
const uint8_t HalfNumElements = NumElements >> 1;
const uint8_t DstSize = GetDstSize(Op);
const bool Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;
std::array<Ref, 4> Srcs {};
for (size_t i = 0; i < HalfNumElements; ++i) {
Srcs[i] = Src1Node;
Srcs[i] = Src1;
}
for (size_t i = HalfNumElements; i < NumElements; ++i) {
Srcs[i] = Src2Node;
Srcs[i] = Src2;
}
Ref Dest = Src1Node;
Ref Dest = Src1;
const uint8_t SelectionMask = NumElements - 1;
const uint8_t ShiftAmount = std::popcount(SelectionMask);
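The Srcs array pairs each destination element with its source register (Src1 for the lower half of each 128-bit lane, Src2 for the upper half), while the immediate supplies one SelectionMask-wide index per element. For reference, the architectural shufps selection rule being modelled, as a standalone scalar sketch (illustrative, not FEX code):

#include <array>
#include <cstddef>
#include <cstdint>

// One 128-bit lane of SHUFPS: result elements 0-1 are picked out of Src1,
// elements 2-3 out of Src2, each by a 2-bit field of the immediate.
std::array<uint32_t, 4> ShufpsRef(const std::array<uint32_t, 4>& Src1, const std::array<uint32_t, 4>& Src2, uint8_t Shuffle) {
  const std::array<const std::array<uint32_t, 4>*, 4> Srcs {&Src1, &Src1, &Src2, &Src2};
  std::array<uint32_t, 4> Result {};
  for (size_t i = 0; i < 4; ++i) {
    const uint8_t Sel = (Shuffle >> (2 * i)) & 0b11; // SelectionMask == 0b11, ShiftAmount == 2
    Result[i] = (*Srcs[i])[Sel];
  }
  return Result;
}

The 256-bit AVX encodings apply the same rule independently to each 128-bit lane, which is why the lowering never has to cross lanes.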
@@ -1311,96 +1307,96 @@ Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, size_t ElementSize, const X86Table
// Combining of low 64-bits.
// Dest[63:0] = Src1[63:0]
// Dest[127:64] = Src2[63:0]
return _VZip(DstSize, 8, Src1Node, Src2Node);
return _VZip(DstSize, 8, Src1, Src2);
case 0b11'10'11'10:
// Combining of high 64-bits.
// Dest[63:0] = Src1[127:64]
// Dest[127:64] = Src2[127:64]
return _VZip2(DstSize, 8, Src1Node, Src2Node);
return _VZip2(DstSize, 8, Src1, Src2);
case 0b11'10'01'00:
// Mixing Low and high elements
// Dest[63:0] = Src1[63:0]
// Dest[127:64] = Src2[127:64]
return _VInsElement(DstSize, 8, 1, 1, Src1Node, Src2Node);
return _VInsElement(DstSize, 8, 1, 1, Src1, Src2);
case 0b01'00'11'10:
// Mixing Low and high elements, inverse of above
// Dest[63:0] = Src1[127:64]
// Dest[127:64] = Src2[63:0]
return _VExtr(DstSize, 1, Src2Node, Src1Node, 8);
return _VExtr(DstSize, 1, Src2, Src1, 8);
case 0b10'00'10'00:
// Mixing even elements.
// Dest[31:0] = Src1[31:0]
// Dest[63:32] = Src1[95:64]
// Dest[95:64] = Src2[31:0]
// Dest[127:96] = Src2[95:64]
return _VUnZip(DstSize, ElementSize, Src1Node, Src2Node);
return _VUnZip(DstSize, ElementSize, Src1, Src2);
case 0b11'01'11'01:
// Mixing odd elements.
// Dest[31:0] = Src1[63:32]
// Dest[63:32] = Src1[127:96]
// Dest[95:64] = Src2[63:32]
// Dest[127:96] = Src2[127:96]
return _VUnZip2(DstSize, ElementSize, Src1Node, Src2Node);
return _VUnZip2(DstSize, ElementSize, Src1, Src2);
case 0b11'10'00'00:
case 0b11'10'01'01:
case 0b11'10'10'10:
case 0b11'10'11'11: {
// Bottom elements duplicated, Top 64-bits inserted
auto DupSrc1 = _VDupElement(DstSize, ElementSize, Src1Node, Shuffle & 0b11);
return _VZip2(DstSize, 8, DupSrc1, Src2Node);
auto DupSrc1 = _VDupElement(DstSize, ElementSize, Src1, Shuffle & 0b11);
return _VZip2(DstSize, 8, DupSrc1, Src2);
}
case 0b01'00'00'00:
case 0b01'00'01'01:
case 0b01'00'10'10:
case 0b01'00'11'11: {
// Bottom elements duplicated, Bottom 64-bits inserted
auto DupSrc1 = _VDupElement(DstSize, ElementSize, Src1Node, Shuffle & 0b11);
return _VZip(DstSize, 8, DupSrc1, Src2Node);
auto DupSrc1 = _VDupElement(DstSize, ElementSize, Src1, Shuffle & 0b11);
return _VZip(DstSize, 8, DupSrc1, Src2);
}
case 0b00'00'01'00:
case 0b01'01'01'00:
case 0b10'10'01'00:
case 0b11'11'01'00: {
// Top elements duplicated, Bottom 64-bits inserted
auto DupSrc2 = _VDupElement(DstSize, ElementSize, Src2Node, (Shuffle >> 4) & 0b11);
return _VZip(DstSize, 8, Src1Node, DupSrc2);
auto DupSrc2 = _VDupElement(DstSize, ElementSize, Src2, (Shuffle >> 4) & 0b11);
return _VZip(DstSize, 8, Src1, DupSrc2);
}
case 0b00'00'11'10:
case 0b01'01'11'10:
case 0b10'10'11'10:
case 0b11'11'11'10: {
// Top elements duplicated, Top 64-bits inserted
auto DupSrc2 = _VDupElement(DstSize, ElementSize, Src2Node, (Shuffle >> 4) & 0b11);
return _VZip2(DstSize, 8, Src1Node, DupSrc2);
auto DupSrc2 = _VDupElement(DstSize, ElementSize, Src2, (Shuffle >> 4) & 0b11);
return _VZip2(DstSize, 8, Src1, DupSrc2);
}
case 0b01'00'01'11: {
// TODO: This doesn't generate optimal code.
// RA doesn't understand that Src1Node is dead after VInsElement due to SRA class differences.
// RA doesn't understand that Src1 is dead after VInsElement due to SRA class differences.
// With RA fixes this would be 2 instructions.
// Odd elements inverted, Low 64-bits inserted
Src1Node = _VInsElement(DstSize, 4, 0, 3, Src1Node, Src1Node);
return _VZip(DstSize, 8, Src1Node, Src2Node);
Src1 = _VInsElement(DstSize, 4, 0, 3, Src1, Src1);
return _VZip(DstSize, 8, Src1, Src2);
}
case 0b11'10'01'11: {
// TODO: This doesn't generate optimal code.
// RA doesn't understand that Src1Node is dead after VInsElement due to SRA class differences.
// RA doesn't understand that Src1 is dead after VInsElement due to SRA class differences.
// With RA fixes this would be 2 instructions.
// Odd elements inverted, Top 64-bits inserted
Src1Node = _VInsElement(DstSize, 4, 0, 3, Src1Node, Src1Node);
return _VInsElement(DstSize, 8, 1, 1, Src1Node, Src2Node);
Src1 = _VInsElement(DstSize, 4, 0, 3, Src1, Src1);
return _VInsElement(DstSize, 8, 1, 1, Src1, Src2);
}
case 0b01'00'00'01: {
// Lower 32-bit elements inverted, low 64-bits inserted
Src1Node = _VRev64(DstSize, 4, Src1Node);
return _VZip(DstSize, 8, Src1Node, Src2Node);
Src1 = _VRev64(DstSize, 4, Src1);
return _VZip(DstSize, 8, Src1, Src2);
}
case 0b11'10'00'01: {
// TODO: This doesn't generate optimal code.
// RA doesn't understand that Src1Node is dead after VInsElement due to SRA class differences.
// RA doesn't understand that Src1 is dead after VInsElement due to SRA class differences.
// With RA fixes this would be 2 instructions.
// Lower 32-bit elements inverted, Top 64-bits inserted
Src1Node = _VRev64(DstSize, 4, Src1Node);
return _VInsElement(DstSize, 8, 1, 1, Src1Node, Src2Node);
Src1 = _VRev64(DstSize, 4, Src1);
return _VInsElement(DstSize, 8, 1, 1, Src1, Src2);
}
case 0b00'00'00'00:
case 0b00'00'01'01:
@@ -1419,8 +1415,8 @@ Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, size_t ElementSize, const X86Table
case 0b11'11'10'10:
case 0b11'11'11'11: {
// Duplicate element in upper and lower across each 64-bit segment.
auto DupSrc1 = _VDupElement(DstSize, ElementSize, Src1Node, Shuffle & 0b11);
auto DupSrc2 = _VDupElement(DstSize, ElementSize, Src2Node, (Shuffle >> 4) & 0b11);
auto DupSrc1 = _VDupElement(DstSize, ElementSize, Src1, Shuffle & 0b11);
auto DupSrc2 = _VDupElement(DstSize, ElementSize, Src2, (Shuffle >> 4) & 0b11);
return _VZip(DstSize, 8, DupSrc1, DupSrc2);
}
default:
@@ -1428,7 +1424,7 @@ Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, size_t ElementSize, const X86Table
if (CTX->BackendFeatures.SupportsVTBL2) {
auto LookupIndexes =
LoadAndCacheIndexedNamedVectorConstant(DstSize, FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_SHUFPS, Shuffle * 16);
return _VTBL2(DstSize, Src1Node, Src2Node, LookupIndexes);
return _VTBL2(DstSize, Src1, Src2, LookupIndexes);
}
break;
}
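The fallback above indexes a precomputed named-vector constant with Shuffle * 16 and hands it to VTBL2, where byte indices 0-15 select from Src1 and 16-31 from Src2 (the two table registers concatenated). Assuming that layout, the 16 lookup indices for one immediate could be derived like this (an illustrative sketch of the table contents, not how FEX actually emits the constants):

#include <array>
#include <cstddef>
#include <cstdint>

// Byte indices for a TBL2-style lookup implementing SHUFPS with a given imm8,
// under the assumption that bytes 0-15 address Src1 and bytes 16-31 address Src2.
std::array<uint8_t, 16> ShufpsTbl2Indices(uint8_t Shuffle) {
  std::array<uint8_t, 16> Indices {};
  for (size_t Element = 0; Element < 4; ++Element) {
    const uint8_t Sel = (Shuffle >> (2 * Element)) & 0b11;
    // Destination elements 0-1 read from Src1, elements 2-3 from Src2.
    const uint8_t Base = static_cast<uint8_t>(Sel * 4 + (Element >= 2 ? 16 : 0));
    for (size_t Byte = 0; Byte < 4; ++Byte) {
      Indices[Element * 4 + Byte] = static_cast<uint8_t>(Base + Byte);
    }
  }
  return Indices;
}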
@@ -1436,18 +1432,18 @@ Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, size_t ElementSize, const X86Table
switch (Shuffle & 0b11) {
case 0b00:
// Low 64-bits of each source interleaved.
return _VZip(DstSize, ElementSize, Src1Node, Src2Node);
return _VZip(DstSize, ElementSize, Src1, Src2);
case 0b01:
// Upper 64-bits of Src1 in lower bits
// Lower 64-bits of Src2 in upper bits.
return _VExtr(DstSize, 1, Src2Node, Src1Node, 8);
return _VExtr(DstSize, 1, Src2, Src1, 8);
case 0b10:
// Lower 64-bits of Src1 in lower bits.
// Upper 64-bits of Src2 in upper bits.
return _VInsElement(DstSize, ElementSize, 1, 1, Src1Node, Src2Node);
return _VInsElement(DstSize, ElementSize, 1, 1, Src1, Src2);
case 0b11:
// Upper 64-bits of each source interleaved.
return _VZip2(DstSize, ElementSize, Src1Node, Src2Node);
return _VZip2(DstSize, ElementSize, Src1, Src2);
}
}
@@ -1463,7 +1459,13 @@ Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, size_t ElementSize, const X86Table
template<size_t ElementSize>
void OpDispatchBuilder::SHUFOp(OpcodeArgs) {
Ref Result = SHUFOpImpl(Op, ElementSize, Op->Dest, Op->Src[0], Op->Src[1]);
Ref Src1Node = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
Ref Src2Node = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
LOGMAN_THROW_A_FMT(Op->Src[1].IsLiteral(), "Imm needs to be a literal");
uint8_t Shuffle = Op->Src[1].Data.Literal.Value;
Ref Result = SHUFOpImpl(Op, GetDstSize(Op), ElementSize, Src1Node, Src2Node, Shuffle);
StoreResult(FPRClass, Op, Result, -1);
}
template void OpDispatchBuilder::SHUFOp<4>(OpcodeArgs);
@@ -1471,7 +1473,13 @@ template void OpDispatchBuilder::SHUFOp<8>(OpcodeArgs);
template<size_t ElementSize>
void OpDispatchBuilder::VSHUFOp(OpcodeArgs) {
Ref Result = SHUFOpImpl(Op, ElementSize, Op->Src[0], Op->Src[1], Op->Src[2]);
Ref Src1Node = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2Node = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
LOGMAN_THROW_A_FMT(Op->Src[2].IsLiteral(), "Imm needs to be a literal");
uint8_t Shuffle = Op->Src[2].Data.Literal.Value;
Ref Result = SHUFOpImpl(Op, GetDstSize(Op), ElementSize, Src1Node, Src2Node, Shuffle);
StoreResult(FPRClass, Op, Result, -1);
}
template void OpDispatchBuilder::VSHUFOp<4>(OpcodeArgs);
@@ -3361,7 +3369,7 @@ template void OpDispatchBuilder::VPFCMPOp<0>(OpcodeArgs);
template void OpDispatchBuilder::VPFCMPOp<1>(OpcodeArgs);
template void OpDispatchBuilder::VPFCMPOp<2>(OpcodeArgs);
Ref OpDispatchBuilder::PMADDWDOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2) {
Ref OpDispatchBuilder::PMADDWDOpImpl(size_t Size, Ref Src1, Ref Src2) {
// This is a pretty curious operation
// Does two MADD operations across 4 16bit signed integers and accumulates to 32bit integers in the destination
//
@@ -3370,42 +3378,42 @@ Ref OpDispatchBuilder::PMADDWDOpImpl(OpcodeArgs, const X86Tables::DecodedOperand
// xmm1[63:32] = (xmm1[47:32] * xmm2[47:32]) + (xmm1[63:48] * xmm2[63:48])
// etc.. for larger registers
auto Size = GetSrcSize(Op);
Ref Src1Node = LoadSource(FPRClass, Op, Src1, Op->Flags);
Ref Src2Node = LoadSource(FPRClass, Op, Src2, Op->Flags);
if (Size == 8) {
if (Size == OpSize::i64Bit) {
// MMX implementation can be slightly more optimal
Size <<= 1;
auto MullResult = _VSMull(Size, 2, Src1Node, Src2Node);
auto MullResult = _VSMull(Size, 2, Src1, Src2);
return _VAddP(Size, 4, MullResult, MullResult);
}
auto Lower = _VSMull(Size, 2, Src1Node, Src2Node);
auto Upper = _VSMull2(Size, 2, Src1Node, Src2Node);
auto Lower = _VSMull(Size, 2, Src1, Src2);
auto Upper = _VSMull2(Size, 2, Src1, Src2);
// [15:0] + [31:16], [47:32] + [63:48], [79:64] + [95:80], [111:96] + [127:112]
return _VAddP(Size, 4, Lower, Upper);
}
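For reference, the behaviour described in the comment above, as a scalar sketch over one 128-bit register (illustrative, not FEX code): corresponding signed 16-bit elements are multiplied and adjacent products summed into 32-bit results, which is what the VSMull/VSMull2 plus VAddP sequence computes.

#include <array>
#include <cstddef>
#include <cstdint>

// Scalar model of PMADDWD: 8 x int16 pairs -> 4 x int32 sums of products.
std::array<int32_t, 4> PmaddwdRef(const std::array<int16_t, 8>& A, const std::array<int16_t, 8>& B) {
  std::array<int32_t, 4> Result {};
  for (size_t i = 0; i < 4; ++i) {
    // Widen before summing; only (-32768 * -32768) * 2 can touch the int32 boundary,
    // and like the hardware it simply wraps to 0x80000000.
    const int64_t Sum = int64_t(A[2 * i]) * B[2 * i] + int64_t(A[2 * i + 1]) * B[2 * i + 1];
    Result[i] = static_cast<int32_t>(Sum);
  }
  return Result;
}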
void OpDispatchBuilder::PMADDWD(OpcodeArgs) {
Ref Result = PMADDWDOpImpl(Op, Op->Dest, Op->Src[0]);
const auto Size = GetSrcSize(Op);
Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Result = PMADDWDOpImpl(Size, Src1, Src2);
StoreResult(FPRClass, Op, Result, -1);
}
void OpDispatchBuilder::VPMADDWDOp(OpcodeArgs) {
Ref Result = PMADDWDOpImpl(Op, Op->Src[0], Op->Src[1]);
const auto Size = GetSrcSize(Op);
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
Ref Result = PMADDWDOpImpl(Size, Src1, Src2);
StoreResult(FPRClass, Op, Result, -1);
}
Ref OpDispatchBuilder::PMADDUBSWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op) {
const auto Size = GetSrcSize(Op);
Ref Src1 = LoadSource(FPRClass, Op, Src1Op, Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Src2Op, Op->Flags);
if (Size == 8) {
Ref OpDispatchBuilder::PMADDUBSWOpImpl(size_t Size, Ref Src1, Ref Src2) {
if (Size == OpSize::i64Bit) {
// 64bit is more efficient
// Src1 is unsigned
@@ -3446,12 +3454,22 @@ Ref OpDispatchBuilder::PMADDUBSWOpImpl(OpcodeArgs, const X86Tables::DecodedOpera
}
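The "Src1 is unsigned" comment refers to the asymmetry in pmaddubsw: bytes from the first source are treated as unsigned, bytes from the second as signed, adjacent products are summed, and the 16-bit result saturates. A scalar sketch of that rule (illustrative, not FEX code):

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>

// Scalar model of PMADDUBSW: 16 x (u8 * s8) products, adjacent pairs summed
// with signed saturation into 8 x int16 results.
std::array<int16_t, 8> PmaddubswRef(const std::array<uint8_t, 16>& A, const std::array<int8_t, 16>& B) {
  std::array<int16_t, 8> Result {};
  for (size_t i = 0; i < 8; ++i) {
    const int32_t Sum = int32_t(A[2 * i]) * B[2 * i] + int32_t(A[2 * i + 1]) * B[2 * i + 1];
    Result[i] = static_cast<int16_t>(std::clamp<int32_t>(Sum, -32768, 32767));
  }
  return Result;
}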
void OpDispatchBuilder::PMADDUBSW(OpcodeArgs) {
Ref Result = PMADDUBSWOpImpl(Op, Op->Dest, Op->Src[0]);
const auto Size = GetSrcSize(Op);
Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Result = PMADDUBSWOpImpl(Size, Src1, Src2);
StoreResult(FPRClass, Op, Result, -1);
}
void OpDispatchBuilder::VPMADDUBSWOp(OpcodeArgs) {
Ref Result = PMADDUBSWOpImpl(Op, Op->Src[0], Op->Src[1]);
const auto Size = GetSrcSize(Op);
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
Ref Result = PMADDUBSWOpImpl(Size, Src1, Src2);
StoreResult(FPRClass, Op, Result, -1);
}
@@ -3683,19 +3701,15 @@ void OpDispatchBuilder::VPHSUBSWOp(OpcodeArgs) {
StoreResult(FPRClass, Op, Dest, -1);
}
Ref OpDispatchBuilder::PSADBWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op) {
Ref OpDispatchBuilder::PSADBWOpImpl(size_t Size, Ref Src1, Ref Src2) {
// The documentation is actually incorrect in how this instruction operates
// It strongly implies that the `abs(dest[i] - src[i])` operates in 8bit space
// but it actually operates in more than 8bit space
// This can be seen with `abs(0 - 0xFF)` returning a different result depending
// on bit length
const auto Size = GetSrcSize(Op);
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
Ref Src1 = LoadSource(FPRClass, Op, Src1Op, Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Src2Op, Op->Flags);
if (Size == 8) {
if (Size == OpSize::i64Bit) {
auto AbsResult = _VUABDL(Size * 2, 1, Src1, Src2);
// Now vector-wide add the results for each
@@ -3727,12 +3741,22 @@ Ref OpDispatchBuilder::PSADBWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand&
}
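The comment's abs(0 - 0xFF) example is the key point: truncated to 8 bits the difference would wrap to 1, but PSADBW computes the absolute difference of the zero-extended bytes, giving 255. A scalar sketch of one 64-bit group (illustrative, not FEX code):

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdlib>

// Scalar model of PSADBW for one 64-bit group: sum of absolute differences of
// eight unsigned bytes, with the subtraction done in a wider-than-8-bit space.
uint64_t PsadbwRef(const std::array<uint8_t, 8>& A, const std::array<uint8_t, 8>& B) {
  uint64_t Sum = 0;
  for (size_t i = 0; i < 8; ++i) {
    Sum += static_cast<uint64_t>(std::abs(int32_t(A[i]) - int32_t(B[i]))); // abs(0 - 0xFF) == 255 here
  }
  return Sum;
}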
void OpDispatchBuilder::PSADBW(OpcodeArgs) {
Ref Result = PSADBWOpImpl(Op, Op->Dest, Op->Src[0]);
const auto Size = GetSrcSize(Op);
Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Result = PSADBWOpImpl(Size, Src1, Src2);
StoreResult(FPRClass, Op, Result, -1);
}
void OpDispatchBuilder::VPSADBWOp(OpcodeArgs) {
Ref Result = PSADBWOpImpl(Op, Op->Src[0], Op->Src[1]);
const auto Size = GetSrcSize(Op);
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
Ref Result = PSADBWOpImpl(Size, Src1, Src2);
StoreResult(FPRClass, Op, Result, -1);
}
@@ -4461,8 +4485,7 @@ void OpDispatchBuilder::VDPPOp(OpcodeArgs) {
template void OpDispatchBuilder::VDPPOp<4>(OpcodeArgs);
template void OpDispatchBuilder::VDPPOp<8>(OpcodeArgs);
Ref OpDispatchBuilder::MPSADBWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op,
const X86Tables::DecodedOperand& ImmOp) {
Ref OpDispatchBuilder::MPSADBWOpImpl(size_t SrcSize, Ref Src1, Ref Src2, uint8_t Select) {
const auto LaneHelper = [&, this](uint32_t Selector_Src1, uint32_t Selector_Src2, Ref Src1, Ref Src2) {
// Src2 will grab a 32bit element and duplicate it across the 128bits
Ref DupSrc = _VDupElement(16, 4, Src2, Selector_Src2);
@@ -4529,18 +4552,12 @@ Ref OpDispatchBuilder::MPSADBWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand
return _VAddP(16, 2, TmpTranspose1, TmpTranspose2);
};
LOGMAN_THROW_A_FMT(ImmOp.IsLiteral(), "ImmOp needs to be literal here");
const uint8_t Select = ImmOp.Data.Literal.Value;
const uint8_t SrcSize = GetSrcSize(Op);
const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
// Src1 needs to be in byte offset
const uint8_t Select_Src1_Low = ((Select & 0b100) >> 2) * 32 / 8;
const uint8_t Select_Src2_Low = Select & 0b11;
Ref Src1 = LoadSource(FPRClass, Op, Src1Op, Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Src2Op, Op->Flags);
Ref Lower = LaneHelper(Select_Src1_Low, Select_Src2_Low, Src1, Src2);
if (Is128Bit) {
return Lower;
@@ -4556,12 +4573,27 @@ Ref OpDispatchBuilder::MPSADBWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand
}
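For reference, the 128-bit mpsadbw behaviour the LaneHelper reproduces: two bits of the immediate pick a fixed 4-byte block of Src2 (the element that gets duplicated), one bit picks the starting byte offset in Src1, and each of the eight 16-bit results is a sum of absolute differences between a sliding 4-byte window of Src1 and that block. A scalar sketch (illustrative, not FEX code; the offsets mirror Select_Src1_Low / Select_Src2_Low above):

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdlib>

// Scalar model of 128-bit MPSADBW.
std::array<uint16_t, 8> MpsadbwRef(const std::array<uint8_t, 16>& Src1, const std::array<uint8_t, 16>& Src2, uint8_t Select) {
  const size_t Src1Offset = ((Select & 0b100) >> 2) * 4; // byte offset, as in Select_Src1_Low
  const size_t Src2Offset = (Select & 0b11) * 4;
  std::array<uint16_t, 8> Result {};
  for (size_t i = 0; i < 8; ++i) {
    uint16_t Sum = 0;
    for (size_t j = 0; j < 4; ++j) {
      Sum += static_cast<uint16_t>(std::abs(int32_t(Src1[Src1Offset + i + j]) - int32_t(Src2[Src2Offset + j])));
    }
    Result[i] = Sum;
  }
  return Result;
}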
void OpDispatchBuilder::MPSADBWOp(OpcodeArgs) {
Ref Result = MPSADBWOpImpl(Op, Op->Dest, Op->Src[0], Op->Src[1]);
LOGMAN_THROW_A_FMT(Op->Src[1].IsLiteral(), "ImmOp needs to be literal here");
const uint8_t Select = Op->Src[1].Data.Literal.Value;
const uint8_t SrcSize = GetSrcSize(Op);
Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Result = MPSADBWOpImpl(SrcSize, Src1, Src2, Select);
StoreResult(FPRClass, Op, Result, -1);
}
void OpDispatchBuilder::VMPSADBWOp(OpcodeArgs) {
Ref Result = MPSADBWOpImpl(Op, Op->Src[0], Op->Src[1], Op->Src[2]);
LOGMAN_THROW_A_FMT(Op->Src[2].IsLiteral(), "ImmOp needs to be literal here");
const uint8_t Select = Op->Src[2].Data.Literal.Value;
const uint8_t SrcSize = GetSrcSize(Op);
Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
Ref Result = MPSADBWOpImpl(SrcSize, Src1, Src2, Select);
StoreResult(FPRClass, Op, Result, -1);
}
@@ -4707,7 +4739,7 @@ void OpDispatchBuilder::VPERMQOp(OpcodeArgs) {
StoreResult(FPRClass, Op, Result, -1);
}
static Ref VBLENDOpImpl(IREmitter& IR, uint32_t VecSize, uint32_t ElementSize, Ref Src1, Ref Src2, Ref ZeroRegister, uint64_t Selector) {
Ref OpDispatchBuilder::VBLENDOpImpl(uint32_t VecSize, uint32_t ElementSize, Ref Src1, Ref Src2, Ref ZeroRegister, uint64_t Selector) {
const std::array Sources {Src1, Src2};
Ref Result = ZeroRegister;
@@ -4715,7 +4747,7 @@ static Ref VBLENDOpImpl(IREmitter& IR, uint32_t VecSize, uint32_t ElementSize, R
for (int i = 0; i < NumElements; i++) {
const auto SelectorIndex = (Selector >> i) & 1;
Result = IR._VInsElement(VecSize, ElementSize, i, i, Result, Sources[SelectorIndex]);
Result = _VInsElement(VecSize, ElementSize, i, i, Result, Sources[SelectorIndex]);
}
return Result;
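The helper assembles the blend one element at a time, starting from a zeroed vector: bit i of the selector chooses whether element i is kept from Src1 or taken from Src2. A scalar equivalent of that selection rule (illustrative, not FEX code):

#include <array>
#include <cstddef>
#include <cstdint>

// Scalar model of the element blend: selector bit i == 0 keeps Src1's element,
// bit i == 1 takes Src2's, matching Sources[SelectorIndex] above.
template<typename T, size_t N>
std::array<T, N> BlendRef(const std::array<T, N>& Src1, const std::array<T, N>& Src2, uint64_t Selector) {
  std::array<T, N> Result {};
  for (size_t i = 0; i < N; ++i) {
    Result[i] = ((Selector >> i) & 1) ? Src2[i] : Src1[i];
  }
  return Result;
}

For VBLENDPD the elements are 8 bytes wide, for VPBLENDD 4, and for VPBLENDW 2, matching the ElementSize arguments passed in the callers below.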
@@ -4744,7 +4776,7 @@ void OpDispatchBuilder::VBLENDPDOp(OpcodeArgs) {
}
const auto ZeroRegister = LoadZeroVector(DstSize);
Ref Result = VBLENDOpImpl(*this, DstSize, 8, Src1, Src2, ZeroRegister, Selector);
Ref Result = VBLENDOpImpl(DstSize, 8, Src1, Src2, ZeroRegister, Selector);
StoreResult(FPRClass, Op, Result, -1);
}
@@ -4787,7 +4819,7 @@ void OpDispatchBuilder::VPBLENDDOp(OpcodeArgs) {
}
const auto ZeroRegister = LoadZeroVector(DstSize);
Ref Result = VBLENDOpImpl(*this, DstSize, 4, Src1, Src2, ZeroRegister, Selector);
Ref Result = VBLENDOpImpl(DstSize, 4, Src1, Src2, ZeroRegister, Selector);
if (!Is256Bit) {
Result = _VMov(16, Result);
}
@@ -4821,7 +4853,7 @@ void OpDispatchBuilder::VPBLENDWOp(OpcodeArgs) {
const auto NewSelector = Selector << 8 | Selector;
const auto ZeroRegister = LoadZeroVector(DstSize);
Ref Result = VBLENDOpImpl(*this, DstSize, 2, Src1, Src2, ZeroRegister, NewSelector);
Ref Result = VBLENDOpImpl(DstSize, 2, Src1, Src2, ZeroRegister, NewSelector);
if (Is128Bit) {
Result = _VMov(16, Result);
}
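The Selector << 8 | Selector step reflects that VPBLENDW's 8-bit immediate controls eight words per 128-bit lane, so the 256-bit form replays the same mask on the upper lane (for the 128-bit form the duplicated high bits are simply unused). A small worked example (illustrative):

#include <cstdint>
#include <cstdio>

int main() {
  const uint16_t Selector = 0b1011'0001; // words 0, 4, 5, 7 come from Src2
  const uint16_t NewSelector = static_cast<uint16_t>(Selector << 8 | Selector);
  std::printf("0x%04x\n", NewSelector); // 0xb1b1: the upper lane (words 8-15) uses the same pattern
}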