AVX256: Initial fixes just to get my unittest working

This is the initial split to decouple AVX256 composed operations from
their MMX/SSE counterparts. It works around the subtle differences
between AVX and SSE zext/insert behaviour.
Ryan Houdek 2024-07-08 06:47:58 -07:00
parent 3d90d1ab4f
commit 7e8d734e43
3 changed files with 56 additions and 20 deletions
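The behaviour being split out: a legacy SSE encoding executed while the guest exposes 256-bit AVX state must preserve the upper 128 bits of the destination YMM register (an insert into the low lane), whereas the VEX/AVX encoding zeroes everything above the written element, and MMX forms never touch XMM/YMM state at all. Below is a minimal standalone C++ sketch of that rule; only the VectorOpType enum mirrors the commit, while the Ymm model and StoreScalar64 are illustrative names.

// Minimal model of the store semantics being decoupled (hypothetical names;
// only VectorOpType mirrors the commit). Builds with any C++17 compiler.
#include <array>
#include <cstdint>
#include <cstdio>

enum class VectorOpType { MMX, SSE, AVX };

// A 256-bit guest vector register modelled as four 64-bit lanes.
using Ymm = std::array<uint64_t, 4>;

// Write a 64-bit scalar result (e.g. MOVQ xmm, r64) to a YMM-sized register.
static Ymm StoreScalar64(VectorOpType Type, const Ymm& Dest, uint64_t Value) {
  switch (Type) {
  case VectorOpType::SSE:
    // Legacy SSE encoding: bits [127:64] are zeroed, but the upper 128 bits
    // of the YMM register are preserved - hence the "insert" behaviour.
    return {Value, 0, Dest[2], Dest[3]};
  case VectorOpType::AVX:
    // VEX encoding: everything above the written element is zeroed.
    return {Value, 0, 0, 0};
  default:
    // MMX forms target the separate 64-bit MMX register file and never
    // touch XMM/YMM state, so the destination is returned unchanged here.
    return Dest;
  }
}

int main() {
  const Ymm Dest = {0x1, 0x2, 0x3, 0x4};
  const Ymm SSE = StoreScalar64(VectorOpType::SSE, Dest, 0xAA);
  const Ymm AVX = StoreScalar64(VectorOpType::AVX, Dest, 0xAA);
  std::printf("SSE: %llx %llx %llx %llx\n", (unsigned long long)SSE[0],
              (unsigned long long)SSE[1], (unsigned long long)SSE[2], (unsigned long long)SSE[3]);
  std::printf("AVX: %llx %llx %llx %llx\n", (unsigned long long)AVX[0],
              (unsigned long long)AVX[1], (unsigned long long)AVX[2], (unsigned long long)AVX[3]);
  return 0;
}

The SSE branch is what the new StoreResult_WithAVXInsert / StoreXMMRegister_WithAVXInsert helpers implement when the guest vector length is 256-bit; AVX-flavoured handlers keep using the plain store path.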

@@ -4475,7 +4475,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl
_StoreContext(OpSize, Class, Src, offsetof(FEXCore::Core::CPUState, mm[gpr - FEXCore::X86State::REG_MM_0]));
} else if (gpr >= FEXCore::X86State::REG_XMM_0) {
const auto gprIndex = gpr - X86State::REG_XMM_0;
-const auto VectorSize = (CTX->HostFeatures.SupportsSVE256 && CTX->HostFeatures.SupportsAVX) ? 32 : 16;
+const auto VectorSize = GetGuestVectorLength();
auto Result = Src;
if (OpSize != VectorSize) {
@@ -5145,7 +5145,7 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() {
{OPD(1, 0b01, 0x6B), 1, &OpDispatchBuilder::VPACKSSOp<4>},
{OPD(1, 0b01, 0x6C), 1, &OpDispatchBuilder::VPUNPCKLOp<8>},
{OPD(1, 0b01, 0x6D), 1, &OpDispatchBuilder::VPUNPCKHOp<8>},
-{OPD(1, 0b01, 0x6E), 1, &OpDispatchBuilder::MOVBetweenGPR_FPR},
+{OPD(1, 0b01, 0x6E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVBetweenGPR_FPR, OpDispatchBuilder::VectorOpType::AVX>},
{OPD(1, 0b01, 0x6F), 1, &OpDispatchBuilder::VMOVAPS_VMOVAPDOp},
{OPD(1, 0b10, 0x6F), 1, &OpDispatchBuilder::VMOVUPS_VMOVUPDOp},
@@ -5165,8 +5165,8 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() {
{OPD(1, 0b01, 0x7D), 1, &OpDispatchBuilder::VHSUBPOp<8>},
{OPD(1, 0b11, 0x7D), 1, &OpDispatchBuilder::VHSUBPOp<4>},
-{OPD(1, 0b01, 0x7E), 1, &OpDispatchBuilder::MOVBetweenGPR_FPR},
-{OPD(1, 0b10, 0x7E), 1, &OpDispatchBuilder::MOVQOp},
+{OPD(1, 0b01, 0x7E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVBetweenGPR_FPR, OpDispatchBuilder::VectorOpType::AVX>},
+{OPD(1, 0b10, 0x7E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVQOp, OpDispatchBuilder::VectorOpType::AVX>},
{OPD(1, 0b01, 0x7F), 1, &OpDispatchBuilder::VMOVAPS_VMOVAPDOp},
{OPD(1, 0b10, 0x7F), 1, &OpDispatchBuilder::VMOVUPS_VMOVUPDOp},
@@ -5190,7 +5190,7 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() {
{OPD(1, 0b01, 0xD3), 1, &OpDispatchBuilder::VPSRLDOp<8>},
{OPD(1, 0b01, 0xD4), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VADD, 8>},
{OPD(1, 0b01, 0xD5), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VMUL, 2>},
-{OPD(1, 0b01, 0xD6), 1, &OpDispatchBuilder::MOVQOp},
+{OPD(1, 0b01, 0xD6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVQOp, OpDispatchBuilder::VectorOpType::AVX>},
{OPD(1, 0b01, 0xD7), 1, &OpDispatchBuilder::MOVMSKOpOne},
{OPD(1, 0b01, 0xD8), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VUQSUB, 1>},
@@ -5602,9 +5602,9 @@ void InstallOpcodeHandlers(Context::OperatingMode Mode) {
{0x3F, 1, &OpDispatchBuilder::ThunkOp},
{0x40, 16, &OpDispatchBuilder::CMOVOp},
-{0x6E, 1, &OpDispatchBuilder::MOVBetweenGPR_FPR},
+{0x6E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVBetweenGPR_FPR, OpDispatchBuilder::VectorOpType::MMX>},
{0x6F, 1, &OpDispatchBuilder::MOVQMMXOp},
-{0x7E, 1, &OpDispatchBuilder::MOVBetweenGPR_FPR},
+{0x7E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVBetweenGPR_FPR, OpDispatchBuilder::VectorOpType::MMX>},
{0x7F, 1, &OpDispatchBuilder::MOVQMMXOp},
{0x80, 16, &OpDispatchBuilder::CondJUMPOp},
{0x90, 16, &OpDispatchBuilder::SETccOp},
@@ -5884,7 +5884,7 @@ void InstallOpcodeHandlers(Context::OperatingMode Mode) {
{0x5F, 1, &OpDispatchBuilder::VectorScalarInsertALUOp<IR::OP_VFMAXSCALARINSERT, 4>},
{0x6F, 1, &OpDispatchBuilder::MOVVectorUnalignedOp},
{0x70, 1, &OpDispatchBuilder::PSHUFWOp<false>},
-{0x7E, 1, &OpDispatchBuilder::MOVQOp},
+{0x7E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVQOp, OpDispatchBuilder::VectorOpType::SSE>},
{0x7F, 1, &OpDispatchBuilder::MOVVectorUnalignedOp},
{0xB8, 1, &OpDispatchBuilder::PopcountOp},
{0xBC, 1, &OpDispatchBuilder::TZCNT},
@@ -5964,7 +5964,7 @@ void InstallOpcodeHandlers(Context::OperatingMode Mode) {
{0x6B, 1, &OpDispatchBuilder::PACKSSOp<4>},
{0x6C, 1, &OpDispatchBuilder::PUNPCKLOp<8>},
{0x6D, 1, &OpDispatchBuilder::PUNPCKHOp<8>},
-{0x6E, 1, &OpDispatchBuilder::MOVBetweenGPR_FPR},
+{0x6E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVBetweenGPR_FPR, OpDispatchBuilder::VectorOpType::SSE>},
{0x6F, 1, &OpDispatchBuilder::MOVVectorAlignedOp},
{0x70, 1, &OpDispatchBuilder::PSHUFDOp},
@@ -5974,7 +5974,7 @@ void InstallOpcodeHandlers(Context::OperatingMode Mode) {
{0x78, 1, nullptr}, // GROUP 17
{0x7C, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFADDP, 8>},
{0x7D, 1, &OpDispatchBuilder::HSUBP<8>},
-{0x7E, 1, &OpDispatchBuilder::MOVBetweenGPR_FPR},
+{0x7E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVBetweenGPR_FPR, OpDispatchBuilder::VectorOpType::SSE>},
{0x7F, 1, &OpDispatchBuilder::MOVVectorAlignedOp},
{0xC2, 1, &OpDispatchBuilder::VFCMPOp<8>},
{0xC4, 1, &OpDispatchBuilder::PINSROp<2>},
@@ -5987,7 +5987,7 @@ void InstallOpcodeHandlers(Context::OperatingMode Mode) {
{0xD3, 1, &OpDispatchBuilder::PSRLDOp<8>},
{0xD4, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VADD, 8>},
{0xD5, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VMUL, 2>},
-{0xD6, 1, &OpDispatchBuilder::MOVQOp},
+{0xD6, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVQOp, OpDispatchBuilder::VectorOpType::SSE>},
{0xD7, 1, &OpDispatchBuilder::MOVMSKOpOne}, // PMOVMSKB
{0xD8, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VUQSUB, 1>},
{0xD9, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VUQSUB, 2>},

@@ -432,6 +432,11 @@ public:
void SGDTOp(OpcodeArgs);
void SMSWOp(OpcodeArgs);
+enum class VectorOpType {
+MMX,
+SSE,
+AVX,
+};
// SSE
void MOVLPOp(OpcodeArgs);
void MOVHPDOp(OpcodeArgs);
@@ -445,7 +450,7 @@ public:
template<FEXCore::IR::IROps IROp, size_t ElementSize>
void VectorUnaryDuplicateOp(OpcodeArgs);
-void MOVQOp(OpcodeArgs);
+void MOVQOp(OpcodeArgs, VectorOpType VectorType);
void MOVQMMXOp(OpcodeArgs);
template<size_t ElementSize>
void MOVMSKOp(OpcodeArgs);
@@ -489,7 +494,7 @@ public:
template<size_t SrcElementSize, bool Narrow, bool HostRoundingMode>
void XMM_To_MMX_Vector_CVT_Float_To_Int(OpcodeArgs);
void MASKMOVOp(OpcodeArgs);
-void MOVBetweenGPR_FPR(OpcodeArgs);
+void MOVBetweenGPR_FPR(OpcodeArgs, VectorOpType VectorType);
void TZCNT(OpcodeArgs);
void LZCNT(OpcodeArgs);
template<size_t ElementSize>
@@ -1171,6 +1176,36 @@ public:
void AVX128_VCVTPS2PH(OpcodeArgs);
// End of AVX 128-bit implementation
// AVX 256-bit operations
+void StoreResult_WithAVXInsert(VectorOpType Type, FEXCore::IR::RegisterClassType Class, FEXCore::X86Tables::DecodedOp Op, Ref Value,
+int8_t Align, MemoryAccessType AccessType = MemoryAccessType::DEFAULT) {
+if (Op->Dest.IsGPR() && Op->Dest.Data.GPR.GPR >= X86State::REG_XMM_0 && Op->Dest.Data.GPR.GPR <= X86State::REG_XMM_15 &&
+GetGuestVectorLength() == Core::CPUState::XMM_AVX_REG_SIZE && Type == VectorOpType::SSE) {
+const auto gpr = Op->Dest.Data.GPR.GPR;
+const auto gprIndex = gpr - X86State::REG_XMM_0;
+auto DestVector = LoadXMMRegister(gprIndex);
+Value = _VInsElement(GetGuestVectorLength(), OpSize::i128Bit, 0, 0, DestVector, Value);
+StoreXMMRegister(gprIndex, Value);
+return;
+}
+StoreResult(Class, Op, Value, Align, AccessType);
+}
+void StoreXMMRegister_WithAVXInsert(VectorOpType Type, uint32_t XMM, Ref Value) {
+if (GetGuestVectorLength() == Core::CPUState::XMM_AVX_REG_SIZE && Type == VectorOpType::SSE) {
+///< SSE vector stores need to insert in the low 128-bit lane of the 256-bit register.
+auto DestVector = LoadXMMRegister(XMM);
+Value = _VInsElement(GetGuestVectorLength(), OpSize::i128Bit, 0, 0, DestVector, Value);
+StoreXMMRegister(XMM, Value);
+return;
+}
+StoreXMMRegister(XMM, Value);
+}
// End of AVX 256-bit implementation
void InvalidOp(OpcodeArgs);
void SetPackedRFLAG(bool Lower8, Ref Src);

@@ -681,7 +681,7 @@ void OpDispatchBuilder::VectorUnaryDuplicateOp(OpcodeArgs) {
template void OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRSQRT, 4>(OpcodeArgs);
template void OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRECP, 4>(OpcodeArgs);
-void OpDispatchBuilder::MOVQOp(OpcodeArgs) {
+void OpDispatchBuilder::MOVQOp(OpcodeArgs, VectorOpType VectorType) {
const auto SrcSize = Op->Src[0].IsGPR() ? 16U : GetSrcSize(Op);
Ref Src = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], SrcSize, Op->Flags);
// This instruction is a bit special that if the destination is a register then it'll ZEXT the 64bit source to 128bit
@@ -690,7 +690,7 @@ void OpDispatchBuilder::MOVQOp(OpcodeArgs) {
const auto gprIndex = gpr - X86State::REG_XMM_0;
auto Reg = _VMov(8, Src);
-StoreXMMRegister(gprIndex, Reg);
+StoreXMMRegister_WithAVXInsert(VectorType, gprIndex, Reg);
} else {
// This is simple, just store the result
StoreResult(FPRClass, Op, Src, -1);
@@ -2327,19 +2327,20 @@ void OpDispatchBuilder::VPMASKMOVOp(OpcodeArgs) {
template void OpDispatchBuilder::VPMASKMOVOp<false>(OpcodeArgs);
template void OpDispatchBuilder::VPMASKMOVOp<true>(OpcodeArgs);
-void OpDispatchBuilder::MOVBetweenGPR_FPR(OpcodeArgs) {
+void OpDispatchBuilder::MOVBetweenGPR_FPR(OpcodeArgs, VectorOpType VectorType) {
if (Op->Dest.IsGPR() && Op->Dest.Data.GPR.GPR >= FEXCore::X86State::REG_XMM_0) {
+Ref Result {};
if (Op->Src[0].IsGPR()) {
// Loading from GPR and moving to Vector.
Ref Src = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], CTX->GetGPRSize(), Op->Flags);
// zext to 128bit
-auto Converted = _VCastFromGPR(16, GetSrcSize(Op), Src);
-StoreResult(FPRClass, Op, Op->Dest, Converted, -1);
+Result = _VCastFromGPR(16, GetSrcSize(Op), Src);
} else {
// Loading from Memory as a scalar. Zero extend
-Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
-StoreResult(FPRClass, Op, Op->Dest, Src, -1);
+Result = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
}
+StoreResult_WithAVXInsert(VectorType, FPRClass, Op, Result, -1);
} else {
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);