From a045c917e74474ffc211e6f3b161537122e2f47b Mon Sep 17 00:00:00 2001
From: TellowKrinkle
Date: Sat, 9 Aug 2025 19:48:33 -0500
Subject: [PATCH] Common: Switch simd mov to auto SSE/AVX

Fold xImplSimd_MoveDQ into xImplSimd_MoveSSE.  Every mov implementation now
carries four SIMDInstructionInfo entries (aligned/unaligned load/store) and
emits through EmitSIMD, so xMOVAPS/xMOVUPS/xMOVAPD/xMOVUPD/xMOVDQA/xMOVDQU
produce the encodings expected by both the SSE and the AVX codegen tests.

Behaviour worth knowing when reading the diff:

* Without AVX, loads and stores are always emitted as MOVAPS/MOVUPS, since
  those are shorter than the PD/DQ forms.  The aligned store is taken from
  the substituted op as well, so it matches the aligned load.
* With AVX, a register-to-register move from an extended register (e.g.
  xmm8) into a non-extended one uses the store opcode with swapped operands
  to save a byte.
* An unaligned mov is emitted in its aligned form when the address is
  displacement-only and the displacement is a multiple of the operand size.
* Register-to-register moves onto the same register (same id and operand
  size) emit nothing.

---
 common/emitter/implement/simd_moremovs.h      |  27 +----
 common/emitter/instructions.h                 |   6 -
 common/emitter/simd.cpp                       | 108 +++++++++++-------
 .../common/x86emitter/codegen_tests_main.cpp  |  64 ++++++++++-
 4 files changed, 129 insertions(+), 76 deletions(-)

diff --git a/common/emitter/implement/simd_moremovs.h b/common/emitter/implement/simd_moremovs.h
index dc82e77ab4..3b6205d2ae 100644
--- a/common/emitter/implement/simd_moremovs.h
+++ b/common/emitter/implement/simd_moremovs.h
@@ -44,7 +44,7 @@ namespace x86Emitter
 	// --------------------------------------------------------------------------------------
 	// xImplSimd_MoveSSE
 	// --------------------------------------------------------------------------------------
-	// Legends in their own right: MOVAPS / MOVAPD / MOVUPS / MOVUPD
+	// Legends in their own right: MOVAPS / MOVAPD / MOVUPS / MOVUPD / MOVDQA / MOVDQU
 	//
 	// All implementations of Unaligned Movs will, when possible, use aligned movs instead.
 	// This happens when using Mem,Reg or Reg,Mem forms where the address is simple displacement
@@ -52,27 +52,10 @@ namespace x86Emitter
 	//
 	struct xImplSimd_MoveSSE
 	{
-		u8 Prefix;
-		bool isAligned;
-
-		void operator()(const xRegisterSSE& to, const xRegisterSSE& from) const;
-		void operator()(const xRegisterSSE& to, const xIndirectVoid& from) const;
-		void operator()(const xIndirectVoid& to, const xRegisterSSE& from) const;
-	};
-
-	// --------------------------------------------------------------------------------------
-	// xImplSimd_MoveDQ
-	// --------------------------------------------------------------------------------------
-	// Implementations for MOVDQA / MOVDQU
-	//
-	// All implementations of Unaligned Movs will, when possible, use aligned movs instead.
-	// This happens when using Mem,Reg or Reg,Mem forms where the address is simple displacement
-	// which can be checked for alignment at runtime.
-
-	struct xImplSimd_MoveDQ
-	{
-		u8 Prefix;
-		bool isAligned;
+		SIMDInstructionInfo aligned_load;
+		SIMDInstructionInfo aligned_store;
+		SIMDInstructionInfo unaligned_load;
+		SIMDInstructionInfo unaligned_store;
 
 		void operator()(const xRegisterSSE& to, const xRegisterSSE& from) const;
 		void operator()(const xRegisterSSE& to, const xIndirectVoid& from) const;
diff --git a/common/emitter/instructions.h b/common/emitter/instructions.h
index 3e5fff270b..4b1d4ffcdf 100644
--- a/common/emitter/instructions.h
+++ b/common/emitter/instructions.h
@@ -486,14 +486,8 @@ namespace x86Emitter
 	extern const xImplSimd_MoveSSE xMOVUPS;
 	extern const xImplSimd_MoveSSE xMOVAPD;
 	extern const xImplSimd_MoveSSE xMOVUPD;
-
-#ifdef ALWAYS_USE_MOVAPS
 	extern const xImplSimd_MoveSSE xMOVDQA;
 	extern const xImplSimd_MoveSSE xMOVDQU;
-#else
-	extern const xImplSimd_MoveDQ xMOVDQA;
-	extern const xImplSimd_MoveDQ xMOVDQU;
-#endif
 
 	extern const xImplSimd_MovHL xMOVH;
 	extern const xImplSimd_MovHL xMOVL;
diff --git a/common/emitter/simd.cpp b/common/emitter/simd.cpp
index d3fff1b09d..689ed3acfd 100644
--- a/common/emitter/simd.cpp
+++ b/common/emitter/simd.cpp
@@ -647,53 +647,55 @@ namespace x86Emitter
 	void xImplSimd_MovHL_RtoR::PS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2) const { EmitSIMD(info, dst, src1, src2); }
 	void xImplSimd_MovHL_RtoR::PD(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2) const { EmitSIMD(info.p66(), dst, src1, src2); }
 
-	static const u16 MovPS_OpAligned = 0x28; // Aligned [aps] form
-	static const u16 MovPS_OpUnaligned = 0x10; // unaligned [ups] form
-
-	void xImplSimd_MoveSSE::operator()(const xRegisterSSE& to, const xRegisterSSE& from) const
+	static bool IsAligned(const xRegisterSSE& reg, const xIndirectVoid& mem)
 	{
-		if (to != from)
-			xOpWrite0F(Prefix, MovPS_OpAligned, to, from);
+		u32 mask = reg.GetOperandSize() - 1;
+		// Aligned if it's displacement-only and the displacement is aligned
+		if (mem.Displacement & mask)
+			return false;
+		return mem.Index.IsEmpty() && mem.Base.IsEmpty();
 	}
 
-	void xImplSimd_MoveSSE::operator()(const xRegisterSSE& to, const xIndirectVoid& from) const
+	static const xImplSimd_MoveSSE& GetLoadStoreOp(const xImplSimd_MoveSSE* op)
 	{
-		// ModSib form is aligned if it's displacement-only and the displacement is aligned:
-		bool isReallyAligned = isAligned || (((from.Displacement & 0x0f) == 0) && from.Index.IsEmpty() && from.Base.IsEmpty());
-
-		xOpWrite0F(Prefix, isReallyAligned ? MovPS_OpAligned : MovPS_OpUnaligned, to, from);
+		if (!x86Emitter::use_avx)
+		{
+			// movaps is shorter, and no processor differentiates between the various movs for load/store
+			const bool aligned = std::bit_cast(op->aligned_load) == std::bit_cast(op->unaligned_load);
+			return aligned ? xMOVAPS : xMOVUPS;
+		}
+		return *op;
 	}
 
-	void xImplSimd_MoveSSE::operator()(const xIndirectVoid& to, const xRegisterSSE& from) const
+	void xImplSimd_MoveSSE::operator()(const xRegisterSSE& dst, const xRegisterSSE& src) const
 	{
-		// ModSib form is aligned if it's displacement-only and the displacement is aligned:
-		bool isReallyAligned = isAligned || ((to.Displacement & 0x0f) == 0 && to.Index.IsEmpty() && to.Base.IsEmpty());
-		xOpWrite0F(Prefix, isReallyAligned ? MovPS_OpAligned + 1 : MovPS_OpUnaligned + 1, from, to);
+		if (dst.GetId() == src.GetId() && dst.GetOperandSize() == src.GetOperandSize())
+			return;
+		SIMDInstructionInfo info = aligned_load;
+		const xRegisterSSE* arg0 = &dst;
+		const xRegisterSSE* arg1 = &src;
+		if (x86Emitter::use_avx)
+		{
+			if (arg1->IsExtended() && !arg0->IsExtended())
+			{
+				// Can save a byte by using the store opcode
+				info = aligned_store;
+				std::swap(arg0, arg1);
+			}
+		}
+		EmitSIMD(info, *arg0, *arg0, *arg1);
 	}
 
-	static const u8 MovDQ_PrefixAligned = 0x66; // Aligned [dqa] form
-	static const u8 MovDQ_PrefixUnaligned = 0xf3; // unaligned [dqu] form
-
-	void xImplSimd_MoveDQ::operator()(const xRegisterSSE& to, const xRegisterSSE& from) const
+	void xImplSimd_MoveSSE::operator()(const xRegisterSSE& dst, const xIndirectVoid& src) const
 	{
-		if (to != from)
-			xOpWrite0F(MovDQ_PrefixAligned, 0x6f, to, from);
+		const xImplSimd_MoveSSE& op = GetLoadStoreOp(this);
+		EmitSIMD(IsAligned(dst, src) ? op.aligned_load : op.unaligned_load, dst, dst, src);
 	}
 
-	void xImplSimd_MoveDQ::operator()(const xRegisterSSE& to, const xIndirectVoid& from) const
+	void xImplSimd_MoveSSE::operator()(const xIndirectVoid& dst, const xRegisterSSE& src) const
 	{
-		// ModSib form is aligned if it's displacement-only and the displacement is aligned:
-		bool isReallyAligned = isAligned || ((from.Displacement & 0x0f) == 0 && from.Index.IsEmpty() && from.Base.IsEmpty());
-		xOpWrite0F(isReallyAligned ? MovDQ_PrefixAligned : MovDQ_PrefixUnaligned, 0x6f, to, from);
-	}
-
-	void xImplSimd_MoveDQ::operator()(const xIndirectVoid& to, const xRegisterSSE& from) const
-	{
-		// ModSib form is aligned if it's displacement-only and the displacement is aligned:
-		bool isReallyAligned = isAligned || ((to.Displacement & 0x0f) == 0 && to.Index.IsEmpty() && to.Base.IsEmpty());
-
-		// use opcode 0x7f : alternate ModRM encoding (reverse src/dst)
-		xOpWrite0F(isReallyAligned ? MovDQ_PrefixAligned : MovDQ_PrefixUnaligned, 0x7f, from, to);
+		const xImplSimd_MoveSSE& op = GetLoadStoreOp(this);
+		EmitSIMD(IsAligned(src, dst) ? op.aligned_store : op.unaligned_store, src, src, dst);
 	}
 
 	void xImplSimd_PMove::BW(const xRegisterSSE& to, const xRegisterSSE& from) const { OpWriteSSE(0x66, OpcodeBase); }
@@ -715,21 +717,39 @@ namespace x86Emitter
 	void xImplSimd_PMove::DQ(const xRegisterSSE& to, const xIndirect64& from) const { OpWriteSSE(0x66, OpcodeBase + 0x500); }
 
 
-	const xImplSimd_MoveSSE xMOVAPS = {0x00, true};
-	const xImplSimd_MoveSSE xMOVUPS = {0x00, false};
+	const xImplSimd_MoveSSE xMOVAPS = {
+		SIMDInstructionInfo(0x28).mov(), SIMDInstructionInfo(0x29).mov(),
+		SIMDInstructionInfo(0x28).mov(), SIMDInstructionInfo(0x29).mov(),
+	};
+	const xImplSimd_MoveSSE xMOVUPS = {
+		SIMDInstructionInfo(0x28).mov(), SIMDInstructionInfo(0x29).mov(),
+		SIMDInstructionInfo(0x10).mov(), SIMDInstructionInfo(0x11).mov(),
+	};
 
 #ifdef ALWAYS_USE_MOVAPS
-	const xImplSimd_MoveSSE xMOVDQA = {0x00, true};
-	const xImplSimd_MoveSSE xMOVAPD = {0x00, true};
+	const xImplSimd_MoveSSE xMOVDQA = xMOVAPS;
+	const xImplSimd_MoveSSE xMOVAPD = xMOVAPS;
 
-	const xImplSimd_MoveSSE xMOVDQU = {0x00, false};
-	const xImplSimd_MoveSSE xMOVUPD = {0x00, false};
+	const xImplSimd_MoveSSE xMOVDQU = xMOVUPS;
+	const xImplSimd_MoveSSE xMOVUPD = xMOVUPS;
 #else
-	const xImplSimd_MoveDQ xMOVDQA = {0x66, true};
-	const xImplSimd_MoveSSE xMOVAPD = {0x66, true};
+	const xImplSimd_MoveSSE xMOVDQA = {
+		SIMDInstructionInfo(0x6f).p66().mov(), SIMDInstructionInfo(0x7f).p66().mov(),
+		SIMDInstructionInfo(0x6f).p66().mov(), SIMDInstructionInfo(0x7f).p66().mov(),
+	};
+	const xImplSimd_MoveSSE xMOVDQU = {
+		SIMDInstructionInfo(0x6f).p66().mov(), SIMDInstructionInfo(0x7f).p66().mov(),
+		SIMDInstructionInfo(0x6f).pf3().mov(), SIMDInstructionInfo(0x7f).pf3().mov(),
+	};
 
-	const xImplSimd_MoveDQ xMOVDQU = {0xf3, false};
-	const xImplSimd_MoveSSE xMOVUPD = {0x66, false};
+	const xImplSimd_MoveSSE xMOVAPD = {
+		SIMDInstructionInfo(0x28).p66().mov(), SIMDInstructionInfo(0x29).p66().mov(),
+		SIMDInstructionInfo(0x28).p66().mov(), SIMDInstructionInfo(0x29).p66().mov(),
+	};
+	const xImplSimd_MoveSSE xMOVUPD = {
+		SIMDInstructionInfo(0x28).p66().mov(), SIMDInstructionInfo(0x29).p66().mov(),
+		SIMDInstructionInfo(0x10).p66().mov(), SIMDInstructionInfo(0x11).p66().mov(),
+	};
 #endif
 
 
diff --git a/tests/ctest/common/x86emitter/codegen_tests_main.cpp b/tests/ctest/common/x86emitter/codegen_tests_main.cpp
index eb691aa920..bb678e0dab 100644
--- a/tests/ctest/common/x86emitter/codegen_tests_main.cpp
+++ b/tests/ctest/common/x86emitter/codegen_tests_main.cpp
@@ -337,10 +337,32 @@ TEST(CodegenTests, SSETest)
 	CODEGEN_TEST(xMOVHL.PS(xmm4, xmm9), "41 0f 12 e1");
 	CODEGEN_TEST(xMOVLH.PS(xmm2, xmm1), "0f 16 d1");
 
-	CODEGEN_TEST(xMOVAPS(xmm0, xmm1), "0f 28 c1");
-	CODEGEN_TEST(xMOVAPS(xmm8, xmm9), "45 0f 28 c1");
-	CODEGEN_TEST(xMOVUPS(xmm8, ptr128[r8+r9]), "47 0f 10 04 08");
-	CODEGEN_TEST(xMOVAPS(ptr128[rax+r9], xmm8), "46 0f 29 04 08");
+	CODEGEN_TEST(xMOVAPS(xmm0, xmm8), "41 0f 28 c0");
+	CODEGEN_TEST(xMOVUPS(xmm8, xmm3), "44 0f 28 c3");
+	CODEGEN_TEST(xMOVAPS(ptr[r8], xmm4), "41 0f 29 20");
+	CODEGEN_TEST(xMOVUPS(ptr[rax], xmm5), "0f 11 28");
+	CODEGEN_TEST(xMOVAPS(xmm8, ptr[r8]), "45 0f 28 00");
+	CODEGEN_TEST(xMOVUPS(xmm5, ptr[r9]), "41 0f 10 29");
+	CODEGEN_TEST(xMOVAPD(ptr[rcx], xmm8), "44 0f 29 01");
+	CODEGEN_TEST(xMOVUPD(ptr[r8], xmm11), "45 0f 11 18");
+	CODEGEN_TEST(xMOVAPD(xmm15, ptr[r9]), "45 0f 28 39");
+	CODEGEN_TEST(xMOVUPD(xmm1, ptr[rax]), "0f 10 08");
+	CODEGEN_TEST(xMOVDQA(ptr[r9], xmm0), "41 0f 29 01");
+	CODEGEN_TEST(xMOVDQU(ptr[r8], xmm3), "41 0f 11 18");
+	CODEGEN_TEST(xMOVDQA(xmm8, ptr[rsi]), "44 0f 28 06");
+	CODEGEN_TEST(xMOVDQU(xmm7, ptr[rcx]), "0f 10 39");
+#ifdef ALWAYS_USE_MOVAPS
+	CODEGEN_TEST(xMOVAPD(xmm4, xmm8), "41 0f 28 e0");
+	CODEGEN_TEST(xMOVUPD(xmm1, xmm4), "0f 28 cc");
+	CODEGEN_TEST(xMOVDQA(xmm9, xmm11), "45 0f 28 cb");
+	CODEGEN_TEST(xMOVDQU(xmm7, xmm10), "41 0f 28 fa");
+#else
+	CODEGEN_TEST(xMOVAPD(xmm4, xmm8), "66 41 0f 28 e0");
+	CODEGEN_TEST(xMOVUPD(xmm1, xmm4), "66 0f 28 cc");
+	CODEGEN_TEST(xMOVDQA(xmm9, xmm11), "66 45 0f 6f cb");
+	CODEGEN_TEST(xMOVDQU(xmm7, xmm10), "66 41 0f 6f fa");
+#endif
+
 	CODEGEN_TEST(xBLEND.PS(xmm0, xmm1, 0x55), "66 0f 3a 0c c1 55");
 	CODEGEN_TEST(xBLEND.PD(xmm8, xmm9, 0xaa), "66 45 0f 3a 0d c1 aa");
 	CODEGEN_TEST(xPBLEND.W(xmm0, xmm1, 0x55), "66 0f 3a 0e c1 55");
@@ -545,6 +567,40 @@ TEST(CodegenTests, AVXTest)
 	CODEGEN_TEST(xMOVHL.PS(xmm4, xmm9), "c4 c1 58 12 e1");
 	CODEGEN_TEST(xMOVLH.PS(xmm2, xmm1), "c5 e8 16 d1");
 
+	CODEGEN_TEST(xMOVAPS(xmm0, xmm8), "c5 78 29 c0");
+	CODEGEN_TEST(xMOVUPS(xmm8, xmm3), "c5 78 28 c3");
+	CODEGEN_TEST(xMOVAPS(ptr[r8], xmm4), "c4 c1 78 29 20");
+	CODEGEN_TEST(xMOVUPS(ptr[rax], xmm5), "c5 f8 11 28");
+	CODEGEN_TEST(xMOVAPS(xmm8, ptr[r8]), "c4 41 78 28 00");
+	CODEGEN_TEST(xMOVUPS(xmm5, ptr[r9]), "c4 c1 78 10 29");
+#ifdef ALWAYS_USE_MOVAPS
+	CODEGEN_TEST(xMOVAPD(xmm4, xmm8), "c5 78 29 c4");
+	CODEGEN_TEST(xMOVUPD(xmm1, xmm4), "c5 f8 28 cc");
+	CODEGEN_TEST(xMOVAPD(ptr[rcx], xmm8), "c5 78 29 01");
+	CODEGEN_TEST(xMOVUPD(ptr[r8], xmm11), "c4 41 78 11 18");
+	CODEGEN_TEST(xMOVAPD(xmm15, ptr[r9]), "c4 41 78 28 39");
+	CODEGEN_TEST(xMOVUPD(xmm1, ptr[rax]), "c5 f8 10 08");
+	CODEGEN_TEST(xMOVDQA(xmm9, xmm11), "c4 41 78 28 cb");
+	CODEGEN_TEST(xMOVDQU(xmm7, xmm10), "c5 78 29 d7");
+	CODEGEN_TEST(xMOVDQA(ptr[r9], xmm0), "c4 c1 78 29 01");
+	CODEGEN_TEST(xMOVDQU(ptr[r8], xmm3), "c4 c1 78 11 18");
+	CODEGEN_TEST(xMOVDQA(xmm8, ptr[rsi]), "c5 78 28 06");
+	CODEGEN_TEST(xMOVDQU(xmm7, ptr[rcx]), "c5 f8 10 39");
+#else
+	CODEGEN_TEST(xMOVAPD(xmm4, xmm8), "c5 79 29 c4");
+	CODEGEN_TEST(xMOVUPD(xmm1, xmm4), "c5 f9 28 cc");
+	CODEGEN_TEST(xMOVAPD(ptr[rcx], xmm8), "c5 79 29 01");
+	CODEGEN_TEST(xMOVUPD(ptr[r8], xmm11), "c4 41 79 11 18");
+	CODEGEN_TEST(xMOVAPD(xmm15, ptr[r9]), "c4 41 79 28 39");
+	CODEGEN_TEST(xMOVUPD(xmm1, ptr[rax]), "c5 f9 10 08");
+	CODEGEN_TEST(xMOVDQA(xmm9, xmm11), "c4 41 79 6f cb");
+	CODEGEN_TEST(xMOVDQU(xmm7, xmm10), "c5 79 7f d7");
+	CODEGEN_TEST(xMOVDQA(ptr[r9], xmm0), "c4 c1 79 7f 01");
+	CODEGEN_TEST(xMOVDQU(ptr[r8], xmm3), "c4 c1 7a 7f 18");
+	CODEGEN_TEST(xMOVDQA(xmm8, ptr[rsi]), "c5 79 6f 06");
+	CODEGEN_TEST(xMOVDQU(xmm7, ptr[rcx]), "c5 fa 6f 39");
+#endif
+
 	CODEGEN_TEST(xVMOVAPS(xmm0, xmm1), "c5 f8 28 c1");
 	CODEGEN_TEST(xVMOVAPS(xmm0, ptr32[rdi]), "c5 f8 28 07");
 	CODEGEN_TEST(xVMOVAPS(ptr32[rdi], xmm0), "c5 f8 29 07");