mirror of
https://github.com/FEX-Emu/FEX.git
synced 2025-03-04 12:39:22 +00:00
Merge pull request #3844 from Sonicadvance1/fix_vmovq
AVX128: Fixes vmovq loading too much data
This commit is contained in:
commit
b9a6caea8d
@ -4475,7 +4475,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl
|
||||
_StoreContext(OpSize, Class, Src, offsetof(FEXCore::Core::CPUState, mm[gpr - FEXCore::X86State::REG_MM_0]));
|
||||
} else if (gpr >= FEXCore::X86State::REG_XMM_0) {
|
||||
const auto gprIndex = gpr - X86State::REG_XMM_0;
|
||||
const auto VectorSize = (CTX->HostFeatures.SupportsSVE256 && CTX->HostFeatures.SupportsAVX) ? 32 : 16;
|
||||
const auto VectorSize = GetGuestVectorLength();
|
||||
|
||||
auto Result = Src;
|
||||
if (OpSize != VectorSize) {
|
||||
@ -5145,7 +5145,7 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() {
|
||||
{OPD(1, 0b01, 0x6B), 1, &OpDispatchBuilder::VPACKSSOp<4>},
|
||||
{OPD(1, 0b01, 0x6C), 1, &OpDispatchBuilder::VPUNPCKLOp<8>},
|
||||
{OPD(1, 0b01, 0x6D), 1, &OpDispatchBuilder::VPUNPCKHOp<8>},
|
||||
{OPD(1, 0b01, 0x6E), 1, &OpDispatchBuilder::MOVBetweenGPR_FPR},
|
||||
{OPD(1, 0b01, 0x6E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVBetweenGPR_FPR, OpDispatchBuilder::VectorOpType::AVX>},
|
||||
|
||||
{OPD(1, 0b01, 0x6F), 1, &OpDispatchBuilder::VMOVAPS_VMOVAPDOp},
|
||||
{OPD(1, 0b10, 0x6F), 1, &OpDispatchBuilder::VMOVUPS_VMOVUPDOp},
|
||||
@ -5165,8 +5165,8 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() {
|
||||
{OPD(1, 0b01, 0x7D), 1, &OpDispatchBuilder::VHSUBPOp<8>},
|
||||
{OPD(1, 0b11, 0x7D), 1, &OpDispatchBuilder::VHSUBPOp<4>},
|
||||
|
||||
{OPD(1, 0b01, 0x7E), 1, &OpDispatchBuilder::MOVBetweenGPR_FPR},
|
||||
{OPD(1, 0b10, 0x7E), 1, &OpDispatchBuilder::MOVQOp},
|
||||
{OPD(1, 0b01, 0x7E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVBetweenGPR_FPR, OpDispatchBuilder::VectorOpType::AVX>},
|
||||
{OPD(1, 0b10, 0x7E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVQOp, OpDispatchBuilder::VectorOpType::AVX>},
|
||||
|
||||
{OPD(1, 0b01, 0x7F), 1, &OpDispatchBuilder::VMOVAPS_VMOVAPDOp},
|
||||
{OPD(1, 0b10, 0x7F), 1, &OpDispatchBuilder::VMOVUPS_VMOVUPDOp},
|
||||
@ -5190,7 +5190,7 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() {
|
||||
{OPD(1, 0b01, 0xD3), 1, &OpDispatchBuilder::VPSRLDOp<8>},
|
||||
{OPD(1, 0b01, 0xD4), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VADD, 8>},
|
||||
{OPD(1, 0b01, 0xD5), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VMUL, 2>},
|
||||
{OPD(1, 0b01, 0xD6), 1, &OpDispatchBuilder::MOVQOp},
|
||||
{OPD(1, 0b01, 0xD6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVQOp, OpDispatchBuilder::VectorOpType::AVX>},
|
||||
{OPD(1, 0b01, 0xD7), 1, &OpDispatchBuilder::MOVMSKOpOne},
|
||||
|
||||
{OPD(1, 0b01, 0xD8), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VUQSUB, 1>},
|
||||
@ -5602,9 +5602,9 @@ void InstallOpcodeHandlers(Context::OperatingMode Mode) {
|
||||
|
||||
{0x3F, 1, &OpDispatchBuilder::ThunkOp},
|
||||
{0x40, 16, &OpDispatchBuilder::CMOVOp},
|
||||
{0x6E, 1, &OpDispatchBuilder::MOVBetweenGPR_FPR},
|
||||
{0x6E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVBetweenGPR_FPR, OpDispatchBuilder::VectorOpType::MMX>},
|
||||
{0x6F, 1, &OpDispatchBuilder::MOVQMMXOp},
|
||||
{0x7E, 1, &OpDispatchBuilder::MOVBetweenGPR_FPR},
|
||||
{0x7E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVBetweenGPR_FPR, OpDispatchBuilder::VectorOpType::MMX>},
|
||||
{0x7F, 1, &OpDispatchBuilder::MOVQMMXOp},
|
||||
{0x80, 16, &OpDispatchBuilder::CondJUMPOp},
|
||||
{0x90, 16, &OpDispatchBuilder::SETccOp},
|
||||
@ -5884,7 +5884,7 @@ void InstallOpcodeHandlers(Context::OperatingMode Mode) {
|
||||
{0x5F, 1, &OpDispatchBuilder::VectorScalarInsertALUOp<IR::OP_VFMAXSCALARINSERT, 4>},
|
||||
{0x6F, 1, &OpDispatchBuilder::MOVVectorUnalignedOp},
|
||||
{0x70, 1, &OpDispatchBuilder::PSHUFWOp<false>},
|
||||
{0x7E, 1, &OpDispatchBuilder::MOVQOp},
|
||||
{0x7E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVQOp, OpDispatchBuilder::VectorOpType::SSE>},
|
||||
{0x7F, 1, &OpDispatchBuilder::MOVVectorUnalignedOp},
|
||||
{0xB8, 1, &OpDispatchBuilder::PopcountOp},
|
||||
{0xBC, 1, &OpDispatchBuilder::TZCNT},
|
||||
@ -5964,7 +5964,7 @@ void InstallOpcodeHandlers(Context::OperatingMode Mode) {
|
||||
{0x6B, 1, &OpDispatchBuilder::PACKSSOp<4>},
|
||||
{0x6C, 1, &OpDispatchBuilder::PUNPCKLOp<8>},
|
||||
{0x6D, 1, &OpDispatchBuilder::PUNPCKHOp<8>},
|
||||
{0x6E, 1, &OpDispatchBuilder::MOVBetweenGPR_FPR},
|
||||
{0x6E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVBetweenGPR_FPR, OpDispatchBuilder::VectorOpType::SSE>},
|
||||
{0x6F, 1, &OpDispatchBuilder::MOVVectorAlignedOp},
|
||||
{0x70, 1, &OpDispatchBuilder::PSHUFDOp},
|
||||
|
||||
@ -5974,7 +5974,7 @@ void InstallOpcodeHandlers(Context::OperatingMode Mode) {
|
||||
{0x78, 1, nullptr}, // GROUP 17
|
||||
{0x7C, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFADDP, 8>},
|
||||
{0x7D, 1, &OpDispatchBuilder::HSUBP<8>},
|
||||
{0x7E, 1, &OpDispatchBuilder::MOVBetweenGPR_FPR},
|
||||
{0x7E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVBetweenGPR_FPR, OpDispatchBuilder::VectorOpType::SSE>},
|
||||
{0x7F, 1, &OpDispatchBuilder::MOVVectorAlignedOp},
|
||||
{0xC2, 1, &OpDispatchBuilder::VFCMPOp<8>},
|
||||
{0xC4, 1, &OpDispatchBuilder::PINSROp<2>},
|
||||
@ -5987,7 +5987,7 @@ void InstallOpcodeHandlers(Context::OperatingMode Mode) {
|
||||
{0xD3, 1, &OpDispatchBuilder::PSRLDOp<8>},
|
||||
{0xD4, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VADD, 8>},
|
||||
{0xD5, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VMUL, 2>},
|
||||
{0xD6, 1, &OpDispatchBuilder::MOVQOp},
|
||||
{0xD6, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVQOp, OpDispatchBuilder::VectorOpType::SSE>},
|
||||
{0xD7, 1, &OpDispatchBuilder::MOVMSKOpOne}, // PMOVMSKB
|
||||
{0xD8, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VUQSUB, 1>},
|
||||
{0xD9, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VUQSUB, 2>},
|
||||
|
@ -432,6 +432,11 @@ public:
|
||||
void SGDTOp(OpcodeArgs);
|
||||
void SMSWOp(OpcodeArgs);
|
||||
|
||||
// Tag identifying which opcode table a shared vector handler was bound from.
// It is supplied as the extra argument of handlers such as MOVQOp and
// MOVBetweenGPR_FPR through Bind<> in the opcode tables, and is inspected by the
// *_WithAVXInsert store helpers, which only perform their insert when given SSE.
enum class VectorOpType {
|
||||
// Bound by the legacy 0x6E/0x7E table entries that sit next to MOVQMMXOp.
MMX,
|
||||
// Bound by the non-OPD(...) table entries for MOVQOp / MOVBetweenGPR_FPR.
SSE,
|
||||
// Bound by the OPD(...) table entries in InstallHostSpecificOpcodeHandlers.
AVX,
|
||||
};
|
||||
// SSE
|
||||
void MOVLPOp(OpcodeArgs);
|
||||
void MOVHPDOp(OpcodeArgs);
|
||||
@ -445,7 +450,7 @@ public:
|
||||
template<FEXCore::IR::IROps IROp, size_t ElementSize>
|
||||
void VectorUnaryDuplicateOp(OpcodeArgs);
|
||||
|
||||
void MOVQOp(OpcodeArgs);
|
||||
void MOVQOp(OpcodeArgs, VectorOpType VectorType);
|
||||
void MOVQMMXOp(OpcodeArgs);
|
||||
template<size_t ElementSize>
|
||||
void MOVMSKOp(OpcodeArgs);
|
||||
@ -489,7 +494,7 @@ public:
|
||||
template<size_t SrcElementSize, bool Narrow, bool HostRoundingMode>
|
||||
void XMM_To_MMX_Vector_CVT_Float_To_Int(OpcodeArgs);
|
||||
void MASKMOVOp(OpcodeArgs);
|
||||
void MOVBetweenGPR_FPR(OpcodeArgs);
|
||||
void MOVBetweenGPR_FPR(OpcodeArgs, VectorOpType VectorType);
|
||||
void TZCNT(OpcodeArgs);
|
||||
void LZCNT(OpcodeArgs);
|
||||
template<size_t ElementSize>
|
||||
@ -1171,6 +1176,36 @@ public:
|
||||
void AVX128_VCVTPS2PH(OpcodeArgs);
|
||||
|
||||
// End of AVX 128-bit implementation
|
||||
|
||||
// AVX 256-bit operations
|
||||
// Stores Value to Op's destination, with a special case for SSE-tagged stores to XMM registers.
//
// When the destination is a register in the XMM0..XMM15 range, the guest vector length equals
// Core::CPUState::XMM_AVX_REG_SIZE, and Type is VectorOpType::SSE, the current contents of the
// destination register are loaded, Value is merged into them with _VInsElement (128-bit element
// size, index 0 to index 0), and the merged vector is written back with StoreXMMRegister.
// NOTE(review): this presumably keeps the upper 128 bits of the register intact - confirm against
// the _VInsElement definition.
//
// In every other case the call is forwarded unchanged to StoreResult.
//
// Type       - which opcode table the calling handler was bound from.
// Class      - register class forwarded to StoreResult (unused on the insert path).
// Op         - decoded instruction whose Dest operand is written.
// Value      - the value to store.
// Align      - forwarded to StoreResult only (unused on the insert path).
// AccessType - forwarded to StoreResult only (unused on the insert path).
void StoreResult_WithAVXInsert(VectorOpType Type, FEXCore::IR::RegisterClassType Class, FEXCore::X86Tables::DecodedOp Op, Ref Value,
|
||||
int8_t Align, MemoryAccessType AccessType = MemoryAccessType::DEFAULT) {
|
||||
// Insert path: XMM register destination, AVX-sized guest vectors, and an SSE-tagged handler.
if (Op->Dest.IsGPR() && Op->Dest.Data.GPR.GPR >= X86State::REG_XMM_0 && Op->Dest.Data.GPR.GPR <= X86State::REG_XMM_15 &&
|
||||
GetGuestVectorLength() == Core::CPUState::XMM_AVX_REG_SIZE && Type == VectorOpType::SSE) {
|
||||
const auto gpr = Op->Dest.Data.GPR.GPR;
|
||||
// Convert the register id into a zero-based XMM index.
const auto gprIndex = gpr - X86State::REG_XMM_0;
|
||||
auto DestVector = LoadXMMRegister(gprIndex);
|
||||
// Merge 128-bit element 0 of Value into element 0 of the existing register contents.
Value = _VInsElement(GetGuestVectorLength(), OpSize::i128Bit, 0, 0, DestVector, Value);
|
||||
StoreXMMRegister(gprIndex, Value);
|
||||
return;
|
||||
}
|
||||
|
||||
// Everything else (memory destinations, non-XMM registers, non-SSE types, other vector lengths).
StoreResult(Class, Op, Value, Align, AccessType);
|
||||
}
|
||||
|
||||
// Writes Value to XMM register index XMM.
//
// When the guest vector length equals Core::CPUState::XMM_AVX_REG_SIZE and Type is
// VectorOpType::SSE, Value is first merged into the register's current contents with
// _VInsElement (128-bit element size, index 0 to index 0) and the merged vector is stored.
// For any other Type or vector length, Value is stored directly with StoreXMMRegister.
//
// Type  - which opcode table the calling handler was bound from.
// XMM   - zero-based XMM register index passed to LoadXMMRegister / StoreXMMRegister.
// Value - the vector value to store.
void StoreXMMRegister_WithAVXInsert(VectorOpType Type, uint32_t XMM, Ref Value) {
|
||||
if (GetGuestVectorLength() == Core::CPUState::XMM_AVX_REG_SIZE && Type == VectorOpType::SSE) {
|
||||
// SSE vector stores need to insert in the low 128-bit lane of the 256-bit register.
|
||||
auto DestVector = LoadXMMRegister(XMM);
|
||||
Value = _VInsElement(GetGuestVectorLength(), OpSize::i128Bit, 0, 0, DestVector, Value);
|
||||
StoreXMMRegister(XMM, Value);
|
||||
return;
|
||||
}
|
||||
// No insert required: store the value as-is.
StoreXMMRegister(XMM, Value);
|
||||
}
|
||||
|
||||
// End of AVX 256-bit implementation
|
||||
|
||||
void InvalidOp(OpcodeArgs);
|
||||
|
||||
void SetPackedRFLAG(bool Lower8, Ref Src);
|
||||
|
@ -843,7 +843,13 @@ void OpDispatchBuilder::AVX128_MOVVectorNT(OpcodeArgs) {
|
||||
}
|
||||
|
||||
void OpDispatchBuilder::AVX128_MOVQ(OpcodeArgs) {
|
||||
auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false);
|
||||
RefPair Src {};
|
||||
if (Op->Src[0].IsGPR()) {
|
||||
Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false);
|
||||
} else {
|
||||
Src.Low = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], OpSize::i64Bit, Op->Flags);
|
||||
}
|
||||
|
||||
// This instruction is a bit special in that, if the destination is a register, it ZEXTs the 64-bit source to 256 bits
|
||||
if (Op->Dest.IsGPR()) {
|
||||
// Zero bits [127:64] as well.
|
||||
|
@ -681,7 +681,7 @@ void OpDispatchBuilder::VectorUnaryDuplicateOp(OpcodeArgs) {
|
||||
template void OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRSQRT, 4>(OpcodeArgs);
|
||||
template void OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRECP, 4>(OpcodeArgs);
|
||||
|
||||
void OpDispatchBuilder::MOVQOp(OpcodeArgs) {
|
||||
void OpDispatchBuilder::MOVQOp(OpcodeArgs, VectorOpType VectorType) {
|
||||
const auto SrcSize = Op->Src[0].IsGPR() ? 16U : GetSrcSize(Op);
|
||||
Ref Src = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], SrcSize, Op->Flags);
|
||||
// This instruction is a bit special in that, if the destination is a register, it ZEXTs the 64-bit source to 128 bits
|
||||
@ -690,7 +690,7 @@ void OpDispatchBuilder::MOVQOp(OpcodeArgs) {
|
||||
const auto gprIndex = gpr - X86State::REG_XMM_0;
|
||||
|
||||
auto Reg = _VMov(8, Src);
|
||||
StoreXMMRegister(gprIndex, Reg);
|
||||
StoreXMMRegister_WithAVXInsert(VectorType, gprIndex, Reg);
|
||||
} else {
|
||||
// This is simple, just store the result
|
||||
StoreResult(FPRClass, Op, Src, -1);
|
||||
@ -2327,19 +2327,20 @@ void OpDispatchBuilder::VPMASKMOVOp(OpcodeArgs) {
|
||||
template void OpDispatchBuilder::VPMASKMOVOp<false>(OpcodeArgs);
|
||||
template void OpDispatchBuilder::VPMASKMOVOp<true>(OpcodeArgs);
|
||||
|
||||
void OpDispatchBuilder::MOVBetweenGPR_FPR(OpcodeArgs) {
|
||||
void OpDispatchBuilder::MOVBetweenGPR_FPR(OpcodeArgs, VectorOpType VectorType) {
|
||||
if (Op->Dest.IsGPR() && Op->Dest.Data.GPR.GPR >= FEXCore::X86State::REG_XMM_0) {
|
||||
Ref Result {};
|
||||
if (Op->Src[0].IsGPR()) {
|
||||
// Loading from GPR and moving to Vector.
|
||||
Ref Src = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], CTX->GetGPRSize(), Op->Flags);
|
||||
// zext to 128bit
|
||||
auto Converted = _VCastFromGPR(16, GetSrcSize(Op), Src);
|
||||
StoreResult(FPRClass, Op, Op->Dest, Converted, -1);
|
||||
Result = _VCastFromGPR(16, GetSrcSize(Op), Src);
|
||||
} else {
|
||||
// Loading from Memory as a scalar. Zero extend
|
||||
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
StoreResult(FPRClass, Op, Op->Dest, Src, -1);
|
||||
Result = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
}
|
||||
|
||||
StoreResult_WithAVXInsert(VectorType, FPRClass, Op, Result, -1);
|
||||
} else {
|
||||
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
|
||||
|
||||
|
68
unittests/ASM/FEX_bugs/vmov_size_test.asm
Normal file
68
unittests/ASM/FEX_bugs/vmov_size_test.asm
Normal file
@ -0,0 +1,68 @@
|
||||
%ifdef CONFIG
|
||||
{
|
||||
"HostFeatures": ["AVX"],
|
||||
"RegData": {
|
||||
"XMM0": ["0x4142434445464748", "0", "0", "0"],
|
||||
"XMM1": ["0x4142434445464748", "0", "0x7172737475767778", "0x8182838485868788"],
|
||||
"XMM2": ["0x0000000041424344", "0", "0", "0"],
|
||||
"XMM3": ["0x0000000041424344", "0", "0x7172737475767778", "0x8182838485868788"]
|
||||
},
|
||||
"MemoryRegions": {
|
||||
"0x100000000": "4096"
|
||||
}
|
||||
}
|
||||
%endif
|
||||
|
||||
; FEX-Emu had a bug where vmovq was loading 128 bits' worth of data instead of 64 bits.
|
||||
; This ensures that {v,}mov{d,q} all load the correct amount of data through a test that will fault if it loads too much.
|
||||
|
||||
; Address at the last eight bytes
|
||||
mov rax, 0x100000000 + 4096-8
|
||||
|
||||
; Address at the last 4 bytes
|
||||
mov rbx, 0x100000000 + 4096-4
|
||||
|
||||
mov rcx, 0x4142434445464748
|
||||
|
||||
; Store data using GPR
|
||||
mov [rax], rcx
|
||||
|
||||
; Setup vector with data
|
||||
vmovaps ymm0, [rel .data]
|
||||
vmovaps ymm1, [rel .data]
|
||||
vmovaps ymm2, [rel .data]
|
||||
vmovaps ymm3, [rel .data]
|
||||
|
||||
; 64-bit tests
|
||||
|
||||
; Load with vmovq to ensure we don't try loading too much data
|
||||
vmovq xmm0, qword [rax]
|
||||
|
||||
; Also test SSE2 version
|
||||
movq xmm1, qword [rax]
|
||||
|
||||
; Also test MOVQ stores
|
||||
vmovq qword [rax], xmm0
|
||||
|
||||
; Also test SSE2 version
|
||||
movq qword [rax], xmm1
|
||||
|
||||
; 32-bit tests
|
||||
; Load with vmovd to ensure we don't try loading too much data
|
||||
vmovd xmm2, dword [rbx]
|
||||
|
||||
; Also test SSE2 version
|
||||
movd xmm3, dword [rbx]
|
||||
|
||||
; Also test MOVD stores
|
||||
vmovd dword [rbx], xmm2
|
||||
|
||||
; Also test SSE2 version
|
||||
movd dword [rbx], xmm3
|
||||
|
||||
hlt
|
||||
|
||||
align 32
|
||||
.data:
|
||||
dq 0x5152535455565758, 0x6162636465666768
|
||||
dq 0x7172737475767778, 0x8182838485868788
|
@ -3872,13 +3872,12 @@
|
||||
]
|
||||
},
|
||||
"vmovq xmm0, qword [rax]": {
|
||||
"ExpectedInstructionCount": 4,
|
||||
"ExpectedInstructionCount": 3,
|
||||
"Comment": [
|
||||
"Map 1 0b01 0x6e 128-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"ldr q2, [x4]",
|
||||
"mov v16.8b, v2.8b",
|
||||
"ldr d16, [x4]",
|
||||
"movi v2.2d, #0x0",
|
||||
"str q2, [x28, #16]"
|
||||
]
|
||||
|
Loading…
x
Reference in New Issue
Block a user