mirror of https://github.com/FEX-Emu/FEX.git
synced 2025-02-08 23:57:05 +00:00

Merge pull request #3730 from Sonicadvance1/avx_4

Vector: Helper refactorings

This commit is contained in:
commit 3f232e631e
@@ -609,6 +609,7 @@ public:
   void VANDNOp(OpcodeArgs);

+  Ref VBLENDOpImpl(uint32_t VecSize, uint32_t ElementSize, Ref Src1, Ref Src2, Ref ZeroRegister, uint64_t Selector);
   void VBLENDPDOp(OpcodeArgs);
   void VPBLENDDOp(OpcodeArgs);
   void VPBLENDWOp(OpcodeArgs);
@@ -1112,8 +1113,7 @@ private:
   Ref InsertPSOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2,
                      const X86Tables::DecodedOperand& Imm);

-  Ref MPSADBWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op,
-                    const X86Tables::DecodedOperand& ImmOp);
+  Ref MPSADBWOpImpl(size_t SrcSize, Ref Src1, Ref Src2, uint8_t Select);

   Ref PALIGNROpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2,
                     const X86Tables::DecodedOperand& Imm, bool IsAVX);
@@ -1131,9 +1131,9 @@ private:
   Ref PINSROpImpl(OpcodeArgs, size_t ElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op,
                   const X86Tables::DecodedOperand& Imm);

-  Ref PMADDWDOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2);
+  Ref PMADDWDOpImpl(size_t Size, Ref Src1, Ref Src2);

-  Ref PMADDUBSWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op);
+  Ref PMADDUBSWOpImpl(size_t Size, Ref Src1, Ref Src2);

   Ref PMULHRSWOpImpl(OpcodeArgs, Ref Src1, Ref Src2);
@@ -1141,9 +1141,9 @@ private:
   Ref PMULLOpImpl(OpcodeArgs, size_t ElementSize, bool Signed, Ref Src1, Ref Src2);

-  Ref PSADBWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op);
+  Ref PSADBWOpImpl(size_t Size, Ref Src1, Ref Src2);

-  Ref PSHUFBOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2);
+  Ref PSHUFBOpImpl(uint8_t SrcSize, Ref Src1, Ref Src2);

   Ref PSIGNImpl(OpcodeArgs, size_t ElementSize, Ref Src1, Ref Src2);
@@ -1155,8 +1155,7 @@ private:
   Ref PSRLDOpImpl(OpcodeArgs, size_t ElementSize, Ref Src, Ref ShiftVec);

-  Ref SHUFOpImpl(OpcodeArgs, size_t ElementSize, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2,
-                 const X86Tables::DecodedOperand& Imm);
+  Ref SHUFOpImpl(OpcodeArgs, size_t DstSize, size_t ElementSize, Ref Src1, Ref Src2, uint8_t Shuffle);

   void VMASKMOVOpImpl(OpcodeArgs, size_t ElementSize, size_t DataSize, bool IsStore, const X86Tables::DecodedOperand& MaskOp,
                       const X86Tables::DecodedOperand& DataOp);
@@ -1004,11 +1004,7 @@ template void OpDispatchBuilder::VPUNPCKHOp<2>(OpcodeArgs);
 template void OpDispatchBuilder::VPUNPCKHOp<4>(OpcodeArgs);
 template void OpDispatchBuilder::VPUNPCKHOp<8>(OpcodeArgs);

-Ref OpDispatchBuilder::PSHUFBOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2) {
-  Ref Src1Node = LoadSource(FPRClass, Op, Src1, Op->Flags);
-  Ref Src2Node = LoadSource(FPRClass, Op, Src2, Op->Flags);
-
-  const auto SrcSize = GetSrcSize(Op);
+Ref OpDispatchBuilder::PSHUFBOpImpl(uint8_t SrcSize, Ref Src1, Ref Src2) {
   const auto Is256Bit = SrcSize == Core::CPUState::XMM_AVX_REG_SIZE;

   // We perform the 256-bit version as two 128-bit operations due to
@@ -1026,25 +1022,33 @@ Ref OpDispatchBuilder::PSHUFBOpImpl(OpcodeArgs, const X86Tables::DecodedOperand&
   const uint8_t MaskImm = SrcSize == 8 ? 0b1000'0111 : 0b1000'1111;

   Ref MaskVector = _VectorImm(SrcSize, 1, MaskImm);
-  Ref MaskedIndices = _VAnd(SrcSize, SrcSize, Src2Node, MaskVector);
+  Ref MaskedIndices = _VAnd(SrcSize, SrcSize, Src2, MaskVector);

-  Ref Low = _VTBL1(SanitizedSrcSize, Src1Node, MaskedIndices);
+  Ref Low = _VTBL1(SanitizedSrcSize, Src1, MaskedIndices);
   if (!Is256Bit) {
     return Low;
   }

-  Ref HighSrc1 = _VInsElement(SrcSize, 16, 0, 1, Src1Node, Src1Node);
+  Ref HighSrc1 = _VInsElement(SrcSize, 16, 0, 1, Src1, Src1);
   Ref High = _VTBL1(SanitizedSrcSize, HighSrc1, MaskedIndices);
   return _VInsElement(SrcSize, 16, 1, 0, Low, High);
 }

 void OpDispatchBuilder::PSHUFBOp(OpcodeArgs) {
-  Ref Result = PSHUFBOpImpl(Op, Op->Dest, Op->Src[0]);
+  const auto SrcSize = GetSrcSize(Op);
+  Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
+  Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
+
+  Ref Result = PSHUFBOpImpl(SrcSize, Src1, Src2);
   StoreResult(FPRClass, Op, Result, -1);
 }

 void OpDispatchBuilder::VPSHUFBOp(OpcodeArgs) {
-  Ref Result = PSHUFBOpImpl(Op, Op->Src[0], Op->Src[1]);
+  const auto SrcSize = GetSrcSize(Op);
+  Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
+  Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
+
+  Ref Result = PSHUFBOpImpl(SrcSize, Src1, Src2);
   StoreResult(FPRClass, Op, Result, -1);
 }
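For orientation, here is a minimal scalar sketch of the 128-bit PSHUFB semantics this helper lowers (my illustration, not FEX code; the name PshufbRef is hypothetical): each destination byte is zero when bit 7 of its index is set, otherwise it selects Src1[index & 0xF]. The 0b1000'1111 mask above keeps exactly those bits, so zeroing indices land outside the 16-byte TBL range and naturally read back as zero.

#include <array>
#include <cstdint>

// Hypothetical reference model, for illustration only.
std::array<uint8_t, 16> PshufbRef(const std::array<uint8_t, 16>& Src1,
                                  const std::array<uint8_t, 16>& Indices) {
  std::array<uint8_t, 16> Result{};
  for (size_t i = 0; i < 16; ++i) {
    // Index bit 7 zeroes the byte; only the low four bits select a source byte.
    Result[i] = (Indices[i] & 0x80) ? 0 : Src1[Indices[i] & 0x0F];
  }
  return Result;
}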
@@ -1259,32 +1263,24 @@ template void OpDispatchBuilder::VPSHUFWOp<2, false>(OpcodeArgs);
 template void OpDispatchBuilder::VPSHUFWOp<2, true>(OpcodeArgs);
 template void OpDispatchBuilder::VPSHUFWOp<4, true>(OpcodeArgs);

-Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, size_t ElementSize, const X86Tables::DecodedOperand& Src1,
-                                  const X86Tables::DecodedOperand& Src2, const X86Tables::DecodedOperand& Imm) {
-  Ref Src1Node = LoadSource(FPRClass, Op, Src1, Op->Flags);
-  Ref Src2Node = LoadSource(FPRClass, Op, Src2, Op->Flags);
-
-  LOGMAN_THROW_A_FMT(Imm.IsLiteral(), "Imm needs to be a literal");
-  uint8_t Shuffle = Imm.Data.Literal.Value;
-
+Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, size_t DstSize, size_t ElementSize, Ref Src1, Ref Src2, uint8_t Shuffle) {
   // Since 256-bit variants and up don't lane cross, we can construct
   // everything in terms of the 128-variant, as each lane is essentially
   // its own 128-bit segment.
   const uint8_t NumElements = Core::CPUState::XMM_SSE_REG_SIZE / ElementSize;
   const uint8_t HalfNumElements = NumElements >> 1;

-  const uint8_t DstSize = GetDstSize(Op);
   const bool Is256Bit = DstSize == Core::CPUState::XMM_AVX_REG_SIZE;

   std::array<Ref, 4> Srcs {};
   for (size_t i = 0; i < HalfNumElements; ++i) {
-    Srcs[i] = Src1Node;
+    Srcs[i] = Src1;
   }
   for (size_t i = HalfNumElements; i < NumElements; ++i) {
-    Srcs[i] = Src2Node;
+    Srcs[i] = Src2;
   }

-  Ref Dest = Src1Node;
+  Ref Dest = Src1;
   const uint8_t SelectionMask = NumElements - 1;
   const uint8_t ShiftAmount = std::popcount(SelectionMask);
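As a reading aid, the selector semantics being decomposed here, written as a scalar sketch of 128-bit SHUFPS (my illustration, not FEX code; the name ShufpsRef is hypothetical): the two low result elements come from Src1 and the two high ones from Src2, each chosen by a 2-bit field of the immediate. The fast paths in the switch that follows are recognizable special cases of this.

#include <array>
#include <cstdint>

// Hypothetical reference model, for illustration only.
std::array<uint32_t, 4> ShufpsRef(const std::array<uint32_t, 4>& Src1,
                                  const std::array<uint32_t, 4>& Src2,
                                  uint8_t Shuffle) {
  std::array<uint32_t, 4> Result = {
    Src1[(Shuffle >> 0) & 0b11],  // Dest[31:0]
    Src1[(Shuffle >> 2) & 0b11],  // Dest[63:32]
    Src2[(Shuffle >> 4) & 0b11],  // Dest[95:64]
    Src2[(Shuffle >> 6) & 0b11],  // Dest[127:96]
  };
  return Result;
}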
@@ -1311,96 +1307,96 @@ Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, size_t ElementSize, const X86Table
     // Combining of low 64-bits.
     // Dest[63:0] = Src1[63:0]
     // Dest[127:64] = Src2[63:0]
-    return _VZip(DstSize, 8, Src1Node, Src2Node);
+    return _VZip(DstSize, 8, Src1, Src2);
   case 0b11'10'11'10:
     // Combining of high 64-bits.
     // Dest[63:0] = Src1[127:64]
     // Dest[127:64] = Src2[127:64]
-    return _VZip2(DstSize, 8, Src1Node, Src2Node);
+    return _VZip2(DstSize, 8, Src1, Src2);
   case 0b11'10'01'00:
     // Mixing Low and high elements
     // Dest[63:0] = Src1[63:0]
     // Dest[127:64] = Src2[127:64]
-    return _VInsElement(DstSize, 8, 1, 1, Src1Node, Src2Node);
+    return _VInsElement(DstSize, 8, 1, 1, Src1, Src2);
   case 0b01'00'11'10:
     // Mixing Low and high elements, inverse of above
     // Dest[63:0] = Src1[127:64]
     // Dest[127:64] = Src2[63:0]
-    return _VExtr(DstSize, 1, Src2Node, Src1Node, 8);
+    return _VExtr(DstSize, 1, Src2, Src1, 8);
   case 0b10'00'10'00:
     // Mixing even elements.
     // Dest[31:0] = Src1[31:0]
     // Dest[63:32] = Src1[95:64]
     // Dest[95:64] = Src2[31:0]
     // Dest[127:96] = Src2[95:64]
-    return _VUnZip(DstSize, ElementSize, Src1Node, Src2Node);
+    return _VUnZip(DstSize, ElementSize, Src1, Src2);
   case 0b11'01'11'01:
     // Mixing odd elements.
     // Dest[31:0] = Src1[63:32]
     // Dest[63:32] = Src1[127:96]
     // Dest[95:64] = Src2[63:32]
     // Dest[127:96] = Src2[127:96]
-    return _VUnZip2(DstSize, ElementSize, Src1Node, Src2Node);
+    return _VUnZip2(DstSize, ElementSize, Src1, Src2);
   case 0b11'10'00'00:
   case 0b11'10'01'01:
   case 0b11'10'10'10:
   case 0b11'10'11'11: {
     // Bottom elements duplicated, Top 64-bits inserted
-    auto DupSrc1 = _VDupElement(DstSize, ElementSize, Src1Node, Shuffle & 0b11);
-    return _VZip2(DstSize, 8, DupSrc1, Src2Node);
+    auto DupSrc1 = _VDupElement(DstSize, ElementSize, Src1, Shuffle & 0b11);
+    return _VZip2(DstSize, 8, DupSrc1, Src2);
   }
   case 0b01'00'00'00:
   case 0b01'00'01'01:
   case 0b01'00'10'10:
   case 0b01'00'11'11: {
     // Bottom elements duplicated, Bottom 64-bits inserted
-    auto DupSrc1 = _VDupElement(DstSize, ElementSize, Src1Node, Shuffle & 0b11);
-    return _VZip(DstSize, 8, DupSrc1, Src2Node);
+    auto DupSrc1 = _VDupElement(DstSize, ElementSize, Src1, Shuffle & 0b11);
+    return _VZip(DstSize, 8, DupSrc1, Src2);
   }
   case 0b00'00'01'00:
   case 0b01'01'01'00:
   case 0b10'10'01'00:
   case 0b11'11'01'00: {
     // Top elements duplicated, Bottom 64-bits inserted
-    auto DupSrc2 = _VDupElement(DstSize, ElementSize, Src2Node, (Shuffle >> 4) & 0b11);
-    return _VZip(DstSize, 8, Src1Node, DupSrc2);
+    auto DupSrc2 = _VDupElement(DstSize, ElementSize, Src2, (Shuffle >> 4) & 0b11);
+    return _VZip(DstSize, 8, Src1, DupSrc2);
   }
   case 0b00'00'11'10:
   case 0b01'01'11'10:
   case 0b10'10'11'10:
   case 0b11'11'11'10: {
     // Top elements duplicated, Top 64-bits inserted
-    auto DupSrc2 = _VDupElement(DstSize, ElementSize, Src2Node, (Shuffle >> 4) & 0b11);
-    return _VZip2(DstSize, 8, Src1Node, DupSrc2);
+    auto DupSrc2 = _VDupElement(DstSize, ElementSize, Src2, (Shuffle >> 4) & 0b11);
+    return _VZip2(DstSize, 8, Src1, DupSrc2);
   }
   case 0b01'00'01'11: {
     // TODO: This doesn't generate optimal code.
-    // RA doesn't understand that Src1Node is dead after VInsElement due to SRA class differences.
+    // RA doesn't understand that Src1 is dead after VInsElement due to SRA class differences.
     // With RA fixes this would be 2 instructions.
     // Odd elements inverted, Low 64-bits inserted
-    Src1Node = _VInsElement(DstSize, 4, 0, 3, Src1Node, Src1Node);
-    return _VZip(DstSize, 8, Src1Node, Src2Node);
+    Src1 = _VInsElement(DstSize, 4, 0, 3, Src1, Src1);
+    return _VZip(DstSize, 8, Src1, Src2);
   }
   case 0b11'10'01'11: {
     // TODO: This doesn't generate optimal code.
-    // RA doesn't understand that Src1Node is dead after VInsElement due to SRA class differences.
+    // RA doesn't understand that Src1 is dead after VInsElement due to SRA class differences.
     // With RA fixes this would be 2 instructions.
     // Odd elements inverted, Top 64-bits inserted
-    Src1Node = _VInsElement(DstSize, 4, 0, 3, Src1Node, Src1Node);
-    return _VInsElement(DstSize, 8, 1, 1, Src1Node, Src2Node);
+    Src1 = _VInsElement(DstSize, 4, 0, 3, Src1, Src1);
+    return _VInsElement(DstSize, 8, 1, 1, Src1, Src2);
   }
   case 0b01'00'00'01: {
     // Lower 32-bit elements inverted, low 64-bits inserted
-    Src1Node = _VRev64(DstSize, 4, Src1Node);
-    return _VZip(DstSize, 8, Src1Node, Src2Node);
+    Src1 = _VRev64(DstSize, 4, Src1);
+    return _VZip(DstSize, 8, Src1, Src2);
   }
   case 0b11'10'00'01: {
     // TODO: This doesn't generate optimal code.
-    // RA doesn't understand that Src1Node is dead after VInsElement due to SRA class differences.
+    // RA doesn't understand that Src1 is dead after VInsElement due to SRA class differences.
     // With RA fixes this would be 2 instructions.
     // Lower 32-bit elements inverted, Top 64-bits inserted
-    Src1Node = _VRev64(DstSize, 4, Src1Node);
-    return _VInsElement(DstSize, 8, 1, 1, Src1Node, Src2Node);
+    Src1 = _VRev64(DstSize, 4, Src1);
+    return _VInsElement(DstSize, 8, 1, 1, Src1, Src2);
   }
   case 0b00'00'00'00:
   case 0b00'00'01'01:
@@ -1419,8 +1415,8 @@ Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, size_t ElementSize, const X86Table
   case 0b11'11'10'10:
   case 0b11'11'11'11: {
     // Duplicate element in upper and lower across each 64-bit segment.
-    auto DupSrc1 = _VDupElement(DstSize, ElementSize, Src1Node, Shuffle & 0b11);
-    auto DupSrc2 = _VDupElement(DstSize, ElementSize, Src2Node, (Shuffle >> 4) & 0b11);
+    auto DupSrc1 = _VDupElement(DstSize, ElementSize, Src1, Shuffle & 0b11);
+    auto DupSrc2 = _VDupElement(DstSize, ElementSize, Src2, (Shuffle >> 4) & 0b11);
     return _VZip(DstSize, 8, DupSrc1, DupSrc2);
   }
   default:
@@ -1428,7 +1424,7 @@ Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, size_t ElementSize, const X86Table
     if (CTX->BackendFeatures.SupportsVTBL2) {
       auto LookupIndexes =
         LoadAndCacheIndexedNamedVectorConstant(DstSize, FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_SHUFPS, Shuffle * 16);
-      return _VTBL2(DstSize, Src1Node, Src2Node, LookupIndexes);
+      return _VTBL2(DstSize, Src1, Src2, LookupIndexes);
     }
     break;
   }
@@ -1436,18 +1432,18 @@ Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, size_t ElementSize, const X86Table
   switch (Shuffle & 0b11) {
   case 0b00:
     // Low 64-bits of each source interleaved.
-    return _VZip(DstSize, ElementSize, Src1Node, Src2Node);
+    return _VZip(DstSize, ElementSize, Src1, Src2);
   case 0b01:
     // Upper 64-bits of Src1 in lower bits
     // Lower 64-bits of Src2 in upper bits.
-    return _VExtr(DstSize, 1, Src2Node, Src1Node, 8);
+    return _VExtr(DstSize, 1, Src2, Src1, 8);
   case 0b10:
     // Lower 32-bits of Src1 in lower bits.
     // Upper 64-bits of Src2 in upper bits.
-    return _VInsElement(DstSize, ElementSize, 1, 1, Src1Node, Src2Node);
+    return _VInsElement(DstSize, ElementSize, 1, 1, Src1, Src2);
   case 0b11:
     // Upper 64-bits of each source interleaved.
-    return _VZip2(DstSize, ElementSize, Src1Node, Src2Node);
+    return _VZip2(DstSize, ElementSize, Src1, Src2);
   }
 }
@@ -1463,7 +1459,13 @@ Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, size_t ElementSize, const X86Table

 template<size_t ElementSize>
 void OpDispatchBuilder::SHUFOp(OpcodeArgs) {
-  Ref Result = SHUFOpImpl(Op, ElementSize, Op->Dest, Op->Src[0], Op->Src[1]);
+  Ref Src1Node = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
+  Ref Src2Node = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
+
+  LOGMAN_THROW_A_FMT(Op->Src[1].IsLiteral(), "Imm needs to be a literal");
+  uint8_t Shuffle = Op->Src[1].Data.Literal.Value;
+
+  Ref Result = SHUFOpImpl(Op, GetDstSize(Op), ElementSize, Src1Node, Src2Node, Shuffle);
   StoreResult(FPRClass, Op, Result, -1);
 }
 template void OpDispatchBuilder::SHUFOp<4>(OpcodeArgs);
@@ -1471,7 +1473,13 @@ template void OpDispatchBuilder::SHUFOp<8>(OpcodeArgs);

 template<size_t ElementSize>
 void OpDispatchBuilder::VSHUFOp(OpcodeArgs) {
-  Ref Result = SHUFOpImpl(Op, ElementSize, Op->Src[0], Op->Src[1], Op->Src[2]);
+  Ref Src1Node = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
+  Ref Src2Node = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
+
+  LOGMAN_THROW_A_FMT(Op->Src[2].IsLiteral(), "Imm needs to be a literal");
+  uint8_t Shuffle = Op->Src[2].Data.Literal.Value;
+
+  Ref Result = SHUFOpImpl(Op, GetDstSize(Op), ElementSize, Src1Node, Src2Node, Shuffle);
   StoreResult(FPRClass, Op, Result, -1);
 }
 template void OpDispatchBuilder::VSHUFOp<4>(OpcodeArgs);
@@ -3361,7 +3369,7 @@ template void OpDispatchBuilder::VPFCMPOp<0>(OpcodeArgs);
 template void OpDispatchBuilder::VPFCMPOp<1>(OpcodeArgs);
 template void OpDispatchBuilder::VPFCMPOp<2>(OpcodeArgs);

-Ref OpDispatchBuilder::PMADDWDOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2) {
+Ref OpDispatchBuilder::PMADDWDOpImpl(size_t Size, Ref Src1, Ref Src2) {
   // This is a pretty curious operation
   // Does two MADD operations across 4 16bit signed integers and accumulates to 32bit integers in the destination
   //
@@ -3370,42 +3378,42 @@ Ref OpDispatchBuilder::PMADDWDOpImpl(OpcodeArgs, const X86Tables::DecodedOperand
   // xmm1[63:32] = (xmm1[47:32] * xmm2[47:32]) + (xmm1[63:48] * xmm2[63:48])
   // etc.. for larger registers

-  auto Size = GetSrcSize(Op);
-
-  Ref Src1Node = LoadSource(FPRClass, Op, Src1, Op->Flags);
-  Ref Src2Node = LoadSource(FPRClass, Op, Src2, Op->Flags);
-
-  if (Size == 8) {
+  if (Size == OpSize::i64Bit) {
     // MMX implementation can be slightly more optimal
     Size <<= 1;
-    auto MullResult = _VSMull(Size, 2, Src1Node, Src2Node);
+    auto MullResult = _VSMull(Size, 2, Src1, Src2);
     return _VAddP(Size, 4, MullResult, MullResult);
   }

-  auto Lower = _VSMull(Size, 2, Src1Node, Src2Node);
-  auto Upper = _VSMull2(Size, 2, Src1Node, Src2Node);
+  auto Lower = _VSMull(Size, 2, Src1, Src2);
+  auto Upper = _VSMull2(Size, 2, Src1, Src2);

   // [15:0 ] + [31:16], [32:47 ] + [63:48 ], [79:64] + [95:80], [111:96] + [127:112]
   return _VAddP(Size, 4, Lower, Upper);
 }

 void OpDispatchBuilder::PMADDWD(OpcodeArgs) {
-  Ref Result = PMADDWDOpImpl(Op, Op->Dest, Op->Src[0]);
+  const auto Size = GetSrcSize(Op);
+
+  Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
+  Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
+
+  Ref Result = PMADDWDOpImpl(Size, Src1, Src2);
   StoreResult(FPRClass, Op, Result, -1);
 }

 void OpDispatchBuilder::VPMADDWDOp(OpcodeArgs) {
-  Ref Result = PMADDWDOpImpl(Op, Op->Src[0], Op->Src[1]);
+  const auto Size = GetSrcSize(Op);
+
+  Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
+  Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
+
+  Ref Result = PMADDWDOpImpl(Size, Src1, Src2);
   StoreResult(FPRClass, Op, Result, -1);
 }

-Ref OpDispatchBuilder::PMADDUBSWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op) {
-  const auto Size = GetSrcSize(Op);
-
-  Ref Src1 = LoadSource(FPRClass, Op, Src1Op, Op->Flags);
-  Ref Src2 = LoadSource(FPRClass, Op, Src2Op, Op->Flags);
-
-  if (Size == 8) {
+Ref OpDispatchBuilder::PMADDUBSWOpImpl(size_t Size, Ref Src1, Ref Src2) {
+  if (Size == OpSize::i64Bit) {
     // 64bit is more efficient

     // Src1 is unsigned
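To make the pairwise multiply-accumulate concrete, a scalar sketch of 128-bit PMADDWD (my illustration, not FEX code; the name PmaddwdRef is hypothetical). The _VSMull/_VSMull2 plus _VAddP sequence above computes exactly these pairwise sums:

#include <array>
#include <cstdint>

// Hypothetical reference model, for illustration only.
std::array<int32_t, 4> PmaddwdRef(const std::array<int16_t, 8>& Src1,
                                  const std::array<int16_t, 8>& Src2) {
  std::array<int32_t, 4> Result{};
  for (size_t i = 0; i < 4; ++i) {
    // Widen each signed 16-bit product to 32 bits, then sum adjacent pairs.
    Result[i] = int32_t(Src1[2 * i]) * int32_t(Src2[2 * i]) +
                int32_t(Src1[2 * i + 1]) * int32_t(Src2[2 * i + 1]);
  }
  return Result;
}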
@@ -3446,12 +3454,22 @@ Ref OpDispatchBuilder::PMADDUBSWOpImpl(OpcodeArgs, const X86Tables::DecodedOpera
 }

 void OpDispatchBuilder::PMADDUBSW(OpcodeArgs) {
-  Ref Result = PMADDUBSWOpImpl(Op, Op->Dest, Op->Src[0]);
+  const auto Size = GetSrcSize(Op);
+
+  Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
+  Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
+
+  Ref Result = PMADDUBSWOpImpl(Size, Src1, Src2);
   StoreResult(FPRClass, Op, Result, -1);
 }

 void OpDispatchBuilder::VPMADDUBSWOp(OpcodeArgs) {
-  Ref Result = PMADDUBSWOpImpl(Op, Op->Src[0], Op->Src[1]);
+  const auto Size = GetSrcSize(Op);
+
+  Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
+  Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
+
+  Ref Result = PMADDUBSWOpImpl(Size, Src1, Src2);
   StoreResult(FPRClass, Op, Result, -1);
 }
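Likewise for PMADDUBSW, a scalar sketch (my illustration, not FEX code; PmaddubswRef is a hypothetical name): unsigned bytes from the first source multiply signed bytes from the second, and adjacent products are summed with signed 16-bit saturation:

#include <algorithm>
#include <array>
#include <cstdint>

// Hypothetical reference model, for illustration only.
std::array<int16_t, 8> PmaddubswRef(const std::array<uint8_t, 16>& Src1,
                                    const std::array<int8_t, 16>& Src2) {
  std::array<int16_t, 8> Result{};
  for (size_t i = 0; i < 8; ++i) {
    // Unsigned * signed products, widened, pairwise-summed, then saturated.
    const int32_t Sum = int32_t(Src1[2 * i]) * int32_t(Src2[2 * i]) +
                        int32_t(Src1[2 * i + 1]) * int32_t(Src2[2 * i + 1]);
    Result[i] = int16_t(std::clamp<int32_t>(Sum, -32768, 32767));
  }
  return Result;
}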
@@ -3683,19 +3701,15 @@ void OpDispatchBuilder::VPHSUBSWOp(OpcodeArgs) {
   StoreResult(FPRClass, Op, Dest, -1);
 }

-Ref OpDispatchBuilder::PSADBWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op) {
+Ref OpDispatchBuilder::PSADBWOpImpl(size_t Size, Ref Src1, Ref Src2) {
   // The documentation is actually incorrect in how this instruction operates
   // It strongly implies that the `abs(dest[i] - src[i])` operates in 8bit space
   // but it actually operates in more than 8bit space
   // This can be seen with `abs(0 - 0xFF)` returning a different result depending
   // on bit length
-  const auto Size = GetSrcSize(Op);
   const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;

-  Ref Src1 = LoadSource(FPRClass, Op, Src1Op, Op->Flags);
-  Ref Src2 = LoadSource(FPRClass, Op, Src2Op, Op->Flags);
-
-  if (Size == 8) {
+  if (Size == OpSize::i64Bit) {
     auto AbsResult = _VUABDL(Size * 2, 1, Src1, Src2);

     // Now vector-wide add the results for each
@@ -3727,12 +3741,22 @@ Ref OpDispatchBuilder::PSADBWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand&
 }

 void OpDispatchBuilder::PSADBW(OpcodeArgs) {
-  Ref Result = PSADBWOpImpl(Op, Op->Dest, Op->Src[0]);
+  const auto Size = GetSrcSize(Op);
+
+  Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
+  Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
+
+  Ref Result = PSADBWOpImpl(Size, Src1, Src2);
   StoreResult(FPRClass, Op, Result, -1);
 }

 void OpDispatchBuilder::VPSADBWOp(OpcodeArgs) {
-  Ref Result = PSADBWOpImpl(Op, Op->Src[0], Op->Src[1]);
+  const auto Size = GetSrcSize(Op);
+
+  Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
+  Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
+
+  Ref Result = PSADBWOpImpl(Size, Src1, Src2);
   StoreResult(FPRClass, Op, Result, -1);
 }
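For reference, a scalar sketch of 128-bit PSADBW (my illustration, not FEX code; PsadbwRef is a hypothetical name): absolute byte differences, computed in widened arithmetic exactly as the comment above stresses, are summed into one value per 64-bit half; the instruction zero-extends each sum into its 64-bit lane.

#include <array>
#include <cstdint>

// Hypothetical reference model, for illustration only.
std::array<uint16_t, 2> PsadbwRef(const std::array<uint8_t, 16>& Src1,
                                  const std::array<uint8_t, 16>& Src2) {
  std::array<uint16_t, 2> Sums{};  // one sum per 64-bit half; max 8 * 255 fits in 16 bits
  for (size_t i = 0; i < 16; ++i) {
    const int Diff = int(Src1[i]) - int(Src2[i]);  // widened before taking abs
    Sums[i / 8] += uint16_t(Diff < 0 ? -Diff : Diff);
  }
  return Sums;
}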
@@ -4461,8 +4485,7 @@ void OpDispatchBuilder::VDPPOp(OpcodeArgs) {
 template void OpDispatchBuilder::VDPPOp<4>(OpcodeArgs);
 template void OpDispatchBuilder::VDPPOp<8>(OpcodeArgs);

-Ref OpDispatchBuilder::MPSADBWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op,
-                                     const X86Tables::DecodedOperand& ImmOp) {
+Ref OpDispatchBuilder::MPSADBWOpImpl(size_t SrcSize, Ref Src1, Ref Src2, uint8_t Select) {
   const auto LaneHelper = [&, this](uint32_t Selector_Src1, uint32_t Selector_Src2, Ref Src1, Ref Src2) {
     // Src2 will grab a 32bit element and duplicate it across the 128bits
     Ref DupSrc = _VDupElement(16, 4, Src2, Selector_Src2);
@@ -4529,18 +4552,12 @@ Ref OpDispatchBuilder::MPSADBWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand
     return _VAddP(16, 2, TmpTranspose1, TmpTranspose2);
   };

-  LOGMAN_THROW_A_FMT(ImmOp.IsLiteral(), "ImmOp needs to be literal here");
-  const uint8_t Select = ImmOp.Data.Literal.Value;
-  const uint8_t SrcSize = GetSrcSize(Op);
   const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;

   // Src1 needs to be in byte offset
   const uint8_t Select_Src1_Low = ((Select & 0b100) >> 2) * 32 / 8;
   const uint8_t Select_Src2_Low = Select & 0b11;

-  Ref Src1 = LoadSource(FPRClass, Op, Src1Op, Op->Flags);
-  Ref Src2 = LoadSource(FPRClass, Op, Src2Op, Op->Flags);
-
   Ref Lower = LaneHelper(Select_Src1_Low, Select_Src2_Low, Src1, Src2);
   if (Is128Bit) {
     return Lower;
@@ -4556,12 +4573,27 @@ Ref OpDispatchBuilder::MPSADBWOpImpl(OpcodeArgs, const X86Tables::DecodedOperand
 }

 void OpDispatchBuilder::MPSADBWOp(OpcodeArgs) {
-  Ref Result = MPSADBWOpImpl(Op, Op->Dest, Op->Src[0], Op->Src[1]);
+  LOGMAN_THROW_A_FMT(Op->Src[1].IsLiteral(), "ImmOp needs to be literal here");
+
+  const uint8_t Select = Op->Src[1].Data.Literal.Value;
+  const uint8_t SrcSize = GetSrcSize(Op);
+  Ref Src1 = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
+  Ref Src2 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
+
+  Ref Result = MPSADBWOpImpl(SrcSize, Src1, Src2, Select);
   StoreResult(FPRClass, Op, Result, -1);
 }

 void OpDispatchBuilder::VMPSADBWOp(OpcodeArgs) {
-  Ref Result = MPSADBWOpImpl(Op, Op->Src[0], Op->Src[1], Op->Src[2]);
+  LOGMAN_THROW_A_FMT(Op->Src[2].IsLiteral(), "ImmOp needs to be literal here");
+
+  const uint8_t Select = Op->Src[2].Data.Literal.Value;
+  const uint8_t SrcSize = GetSrcSize(Op);
+  Ref Src1 = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
+  Ref Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags);
+
+
+  Ref Result = MPSADBWOpImpl(SrcSize, Src1, Src2, Select);
   StoreResult(FPRClass, Op, Result, -1);
 }
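For orientation, a scalar sketch of 128-bit MPSADBW (my illustration, not FEX code; MpsadbwRef is a hypothetical name): bits 1:0 of the immediate pick one 4-byte block of Src2, bit 2 picks the Src1 base offset (matching the Select_Src1_Low/Select_Src2_Low computation above), and each of the eight results is the SAD of a sliding 4-byte window of Src1 against that block:

#include <array>
#include <cstdint>
#include <cstdlib>

// Hypothetical reference model, for illustration only.
std::array<uint16_t, 8> MpsadbwRef(const std::array<uint8_t, 16>& Src1,
                                   const std::array<uint8_t, 16>& Src2,
                                   uint8_t Select) {
  const size_t Src1Offset = ((Select >> 2) & 1) * 4;  // bit 2: Src1 base, in bytes
  const size_t Src2Offset = (Select & 0b11) * 4;      // bits 1:0: Src2 block, in bytes
  std::array<uint16_t, 8> Result{};
  for (size_t i = 0; i < 8; ++i) {
    for (size_t j = 0; j < 4; ++j) {
      // Sum of absolute differences over the 4-byte window starting at i.
      Result[i] += uint16_t(std::abs(int(Src1[Src1Offset + i + j]) - int(Src2[Src2Offset + j])));
    }
  }
  return Result;
}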
@@ -4707,7 +4739,7 @@ void OpDispatchBuilder::VPERMQOp(OpcodeArgs) {
   StoreResult(FPRClass, Op, Result, -1);
 }

-static Ref VBLENDOpImpl(IREmitter& IR, uint32_t VecSize, uint32_t ElementSize, Ref Src1, Ref Src2, Ref ZeroRegister, uint64_t Selector) {
+Ref OpDispatchBuilder::VBLENDOpImpl(uint32_t VecSize, uint32_t ElementSize, Ref Src1, Ref Src2, Ref ZeroRegister, uint64_t Selector) {
   const std::array Sources {Src1, Src2};

   Ref Result = ZeroRegister;
@@ -4715,7 +4747,7 @@ static Ref VBLENDOpImpl(IREmitter& IR, uint32_t VecSize, uint32_t ElementSize, R
   for (int i = 0; i < NumElements; i++) {
     const auto SelectorIndex = (Selector >> i) & 1;

-    Result = IR._VInsElement(VecSize, ElementSize, i, i, Result, Sources[SelectorIndex]);
+    Result = _VInsElement(VecSize, ElementSize, i, i, Result, Sources[SelectorIndex]);
   }

   return Result;
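The element-insert loop implements an ordinary per-element blend; as a scalar sketch (my illustration, not FEX code; BlendRef is a hypothetical name), selector bit i picks element i from Src2 when set and from Src1 when clear:

#include <array>
#include <cstdint>

// Hypothetical reference model, for illustration only.
template <typename T, size_t N>
std::array<T, N> BlendRef(const std::array<T, N>& Src1, const std::array<T, N>& Src2, uint64_t Selector) {
  std::array<T, N> Result{};
  for (size_t i = 0; i < N; ++i) {
    Result[i] = ((Selector >> i) & 1) ? Src2[i] : Src1[i];  // matches Sources[SelectorIndex]
  }
  return Result;
}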
@@ -4744,7 +4776,7 @@ void OpDispatchBuilder::VBLENDPDOp(OpcodeArgs) {
   }

   const auto ZeroRegister = LoadZeroVector(DstSize);
-  Ref Result = VBLENDOpImpl(*this, DstSize, 8, Src1, Src2, ZeroRegister, Selector);
+  Ref Result = VBLENDOpImpl(DstSize, 8, Src1, Src2, ZeroRegister, Selector);
   StoreResult(FPRClass, Op, Result, -1);
 }
@@ -4787,7 +4819,7 @@ void OpDispatchBuilder::VPBLENDDOp(OpcodeArgs) {
   }

   const auto ZeroRegister = LoadZeroVector(DstSize);
-  Ref Result = VBLENDOpImpl(*this, DstSize, 4, Src1, Src2, ZeroRegister, Selector);
+  Ref Result = VBLENDOpImpl(DstSize, 4, Src1, Src2, ZeroRegister, Selector);
   if (!Is256Bit) {
     Result = _VMov(16, Result);
   }
@@ -4821,7 +4853,7 @@ void OpDispatchBuilder::VPBLENDWOp(OpcodeArgs) {
   const auto NewSelector = Selector << 8 | Selector;

   const auto ZeroRegister = LoadZeroVector(DstSize);
-  Ref Result = VBLENDOpImpl(*this, DstSize, 2, Src1, Src2, ZeroRegister, NewSelector);
+  Ref Result = VBLENDOpImpl(DstSize, 2, Src1, Src2, ZeroRegister, NewSelector);
   if (Is128Bit) {
     Result = _VMov(16, Result);
   }