From a045c917e74474ffc211e6f3b161537122e2f47b Mon Sep 17 00:00:00 2001
From: TellowKrinkle <tellowkrinkle@gmail.com>
Date: Sat, 9 Aug 2025 19:48:33 -0500
Subject: [PATCH] Common: Switch simd mov to auto SSE/AVX

---
 common/emitter/implement/simd_moremovs.h      |  27 +----
 common/emitter/instructions.h                 |   6 -
 common/emitter/simd.cpp                       | 108 +++++++++++-------
 .../common/x86emitter/codegen_tests_main.cpp  |  64 ++++++++++-
 4 files changed, 129 insertions(+), 76 deletions(-)

diff --git a/common/emitter/implement/simd_moremovs.h b/common/emitter/implement/simd_moremovs.h
index dc82e77ab4..3b6205d2ae 100644
--- a/common/emitter/implement/simd_moremovs.h
+++ b/common/emitter/implement/simd_moremovs.h
@@ -44,7 +44,7 @@ namespace x86Emitter
 	// --------------------------------------------------------------------------------------
 	//  xImplSimd_MoveSSE
 	// --------------------------------------------------------------------------------------
-	// Legends in their own right: MOVAPS / MOVAPD / MOVUPS / MOVUPD
+	// Legends in their own right: MOVAPS / MOVAPD / MOVUPS / MOVUPD / MOVDQA / MOVDQU
 	//
 	// All implementations of Unaligned Movs will, when possible, use aligned movs instead.
 	// This happens when using Mem,Reg or Reg,Mem forms where the address is simple displacement
@@ -52,27 +52,10 @@ namespace x86Emitter
 	//
 	struct xImplSimd_MoveSSE
 	{
-		u8 Prefix;
-		bool isAligned;
-
-		void operator()(const xRegisterSSE& to, const xRegisterSSE& from) const;
-		void operator()(const xRegisterSSE& to, const xIndirectVoid& from) const;
-		void operator()(const xIndirectVoid& to, const xRegisterSSE& from) const;
-	};
-
-	// --------------------------------------------------------------------------------------
-	//  xImplSimd_MoveDQ
-	// --------------------------------------------------------------------------------------
-	// Implementations for MOVDQA / MOVDQU
-	//
-	// All implementations of Unaligned Movs will, when possible, use aligned movs instead.
-	// This happens when using Mem,Reg or Reg,Mem forms where the address is simple displacement
-	// which can be checked for alignment at runtime.
-
-	struct xImplSimd_MoveDQ
-	{
-		u8 Prefix;
-		bool isAligned;
+		SIMDInstructionInfo aligned_load;
+		SIMDInstructionInfo aligned_store;
+		SIMDInstructionInfo unaligned_load;
+		SIMDInstructionInfo unaligned_store;
 
 		void operator()(const xRegisterSSE& to, const xRegisterSSE& from) const;
 		void operator()(const xRegisterSSE& to, const xIndirectVoid& from) const;
diff --git a/common/emitter/instructions.h b/common/emitter/instructions.h
index 3e5fff270b..4b1d4ffcdf 100644
--- a/common/emitter/instructions.h
+++ b/common/emitter/instructions.h
@@ -486,14 +486,8 @@ namespace x86Emitter
 	extern const xImplSimd_MoveSSE xMOVUPS;
 	extern const xImplSimd_MoveSSE xMOVAPD;
 	extern const xImplSimd_MoveSSE xMOVUPD;
-
-#ifdef ALWAYS_USE_MOVAPS
 	extern const xImplSimd_MoveSSE xMOVDQA;
 	extern const xImplSimd_MoveSSE xMOVDQU;
-#else
-	extern const xImplSimd_MoveDQ xMOVDQA;
-	extern const xImplSimd_MoveDQ xMOVDQU;
-#endif
 
 	extern const xImplSimd_MovHL xMOVH;
 	extern const xImplSimd_MovHL xMOVL;
diff --git a/common/emitter/simd.cpp b/common/emitter/simd.cpp
index d3fff1b09d..689ed3acfd 100644
--- a/common/emitter/simd.cpp
+++ b/common/emitter/simd.cpp
@@ -647,53 +647,55 @@ namespace x86Emitter
 	void xImplSimd_MovHL_RtoR::PS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2) const { EmitSIMD(info, dst, src1, src2); }
 	void xImplSimd_MovHL_RtoR::PD(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2) const { EmitSIMD(info.p66(), dst, src1, src2); }
 
-	static const u16 MovPS_OpAligned = 0x28; // Aligned [aps] form
-	static const u16 MovPS_OpUnaligned = 0x10; // unaligned [ups] form
-
-	void xImplSimd_MoveSSE::operator()(const xRegisterSSE& to, const xRegisterSSE& from) const
+	static bool IsAligned(const xRegisterSSE& reg, const xIndirectVoid& mem)
 	{
-		if (to != from)
-			xOpWrite0F(Prefix, MovPS_OpAligned, to, from);
+		u32 mask = reg.GetOperandSize() - 1; // low-bit mask for the register width (assumes the operand size is a power of two)
+		// Aligned if it's displacement-only and the displacement is aligned
+		if (mem.Displacement & mask)
+			return false;
+		return mem.Index.IsEmpty() && mem.Base.IsEmpty(); // with a base or index register the address cannot be checked here
 	}
 
-	void xImplSimd_MoveSSE::operator()(const xRegisterSSE& to, const xIndirectVoid& from) const
+	static const xImplSimd_MoveSSE& GetLoadStoreOp(const xImplSimd_MoveSSE* op)
 	{
-		// ModSib form is aligned if it's displacement-only and the displacement is aligned:
-		bool isReallyAligned = isAligned || (((from.Displacement & 0x0f) == 0) && from.Index.IsEmpty() && from.Base.IsEmpty());
-
-		xOpWrite0F(Prefix, isReallyAligned ? MovPS_OpAligned : MovPS_OpUnaligned, to, from);
+		if (!x86Emitter::use_avx)
+		{
+			// movaps is shorter, and no processor differentiates between the various movs for load/store
+			const bool aligned = std::bit_cast<u32>(op->aligned_load) == std::bit_cast<u32>(op->unaligned_load); // the "A" variants store the same encoding in both load slots
+			return aligned ? xMOVAPS : xMOVUPS;
+		}
+		return *op; // AVX path: keep the caller's own encodings
 	}
 
-	void xImplSimd_MoveSSE::operator()(const xIndirectVoid& to, const xRegisterSSE& from) const
+	void xImplSimd_MoveSSE::operator()(const xRegisterSSE& dst, const xRegisterSSE& src) const
 	{
-		// ModSib form is aligned if it's displacement-only and the displacement is aligned:
-		bool isReallyAligned = isAligned || ((to.Displacement & 0x0f) == 0 && to.Index.IsEmpty() && to.Base.IsEmpty());
-		xOpWrite0F(Prefix, isReallyAligned ? MovPS_OpAligned + 1 : MovPS_OpUnaligned + 1, from, to);
+		if (dst.GetId() == src.GetId() && dst.GetOperandSize() == src.GetOperandSize())
+			return; // same register at the same width: nothing is emitted
+		SIMDInstructionInfo info = aligned_load; // register-to-register moves start from the aligned load encoding
+		const xRegisterSSE* arg0 = &dst;
+		const xRegisterSSE* arg1 = &src;
+		if (x86Emitter::use_avx)
+		{
+			if (arg1->IsExtended() && !arg0->IsExtended())
+			{
+				// Can save a byte by using the store opcode
+				info = aligned_store;
+				std::swap(arg0, arg1); // the store encoding takes the source register as its first operand
+			}
+		}
+		EmitSIMD(info, *arg0, *arg0, *arg1);
 	}
 
-	static const u8 MovDQ_PrefixAligned = 0x66; // Aligned [dqa] form
-	static const u8 MovDQ_PrefixUnaligned = 0xf3; // unaligned [dqu] form
-
-	void xImplSimd_MoveDQ::operator()(const xRegisterSSE& to, const xRegisterSSE& from) const
+	void xImplSimd_MoveSSE::operator()(const xRegisterSSE& dst, const xIndirectVoid& src) const
 	{
-		if (to != from)
-			xOpWrite0F(MovDQ_PrefixAligned, 0x6f, to, from);
+		const xImplSimd_MoveSSE& op = GetLoadStoreOp(this); // may substitute xMOVAPS/xMOVUPS when AVX is off
+		EmitSIMD(IsAligned(dst, src) ? op.aligned_load : op.unaligned_load, dst, dst, src);
 	}
 
-	void xImplSimd_MoveDQ::operator()(const xRegisterSSE& to, const xIndirectVoid& from) const
+	void xImplSimd_MoveSSE::operator()(const xIndirectVoid& dst, const xRegisterSSE& src) const
 	{
-		// ModSib form is aligned if it's displacement-only and the displacement is aligned:
-		bool isReallyAligned = isAligned || ((from.Displacement & 0x0f) == 0 && from.Index.IsEmpty() && from.Base.IsEmpty());
-		xOpWrite0F(isReallyAligned ? MovDQ_PrefixAligned : MovDQ_PrefixUnaligned, 0x6f, to, from);
-	}
-
-	void xImplSimd_MoveDQ::operator()(const xIndirectVoid& to, const xRegisterSSE& from) const
-	{
-		// ModSib form is aligned if it's displacement-only and the displacement is aligned:
-		bool isReallyAligned = isAligned || ((to.Displacement & 0x0f) == 0 && to.Index.IsEmpty() && to.Base.IsEmpty());
-
-		// use opcode 0x7f : alternate ModRM encoding (reverse src/dst)
-		xOpWrite0F(isReallyAligned ? MovDQ_PrefixAligned : MovDQ_PrefixUnaligned, 0x7f, from, to);
+		const xImplSimd_MoveSSE& op = GetLoadStoreOp(this); // store to memory: both encodings must come from op, as in the load overload
+		EmitSIMD(IsAligned(src, dst) ? op.aligned_store : op.unaligned_store, src, src, dst);
 	}
 
 	void xImplSimd_PMove::BW(const xRegisterSSE& to, const xRegisterSSE& from) const { OpWriteSSE(0x66, OpcodeBase); }
@@ -715,21 +717,39 @@ namespace x86Emitter
 	void xImplSimd_PMove::DQ(const xRegisterSSE& to, const xIndirect64& from) const { OpWriteSSE(0x66, OpcodeBase + 0x500); }
 
 
-	const xImplSimd_MoveSSE xMOVAPS = {0x00, true};
-	const xImplSimd_MoveSSE xMOVUPS = {0x00, false};
+	const xImplSimd_MoveSSE xMOVAPS = {
+		SIMDInstructionInfo(0x28).mov(), SIMDInstructionInfo(0x29).mov(), // aligned load, aligned store
+		SIMDInstructionInfo(0x28).mov(), SIMDInstructionInfo(0x29).mov(), // unaligned slots repeat the aligned opcodes
+	};
+	const xImplSimd_MoveSSE xMOVUPS = {
+		SIMDInstructionInfo(0x28).mov(), SIMDInstructionInfo(0x29).mov(), // aligned load, aligned store
+		SIMDInstructionInfo(0x10).mov(), SIMDInstructionInfo(0x11).mov(), // unaligned load, unaligned store
+	};
 
 #ifdef ALWAYS_USE_MOVAPS
-	const xImplSimd_MoveSSE xMOVDQA = {0x00, true};
-	const xImplSimd_MoveSSE xMOVAPD = {0x00, true};
+	const xImplSimd_MoveSSE xMOVDQA = xMOVAPS; // ALWAYS_USE_MOVAPS: the aligned variants are copies of xMOVAPS
+	const xImplSimd_MoveSSE xMOVAPD = xMOVAPS;
 
-	const xImplSimd_MoveSSE xMOVDQU = {0x00, false};
-	const xImplSimd_MoveSSE xMOVUPD = {0x00, false};
+	const xImplSimd_MoveSSE xMOVDQU = xMOVUPS; // ALWAYS_USE_MOVAPS: the unaligned variants are copies of xMOVUPS
+	const xImplSimd_MoveSSE xMOVUPD = xMOVUPS;
 #else
-	const xImplSimd_MoveDQ xMOVDQA = {0x66, true};
-	const xImplSimd_MoveSSE xMOVAPD = {0x66, true};
+	const xImplSimd_MoveSSE xMOVDQA = {
+		SIMDInstructionInfo(0x6f).p66().mov(), SIMDInstructionInfo(0x7f).p66().mov(), // aligned load, aligned store
+		SIMDInstructionInfo(0x6f).p66().mov(), SIMDInstructionInfo(0x7f).p66().mov(), // unaligned slots repeat the aligned encodings
+	};
+	const xImplSimd_MoveSSE xMOVDQU = {
+		SIMDInstructionInfo(0x6f).p66().mov(), SIMDInstructionInfo(0x7f).p66().mov(), // aligned load, aligned store
+		SIMDInstructionInfo(0x6f).pf3().mov(), SIMDInstructionInfo(0x7f).pf3().mov(), // unaligned load, unaligned store (pf3 instead of p66)
+	};
 
-	const xImplSimd_MoveDQ xMOVDQU = {0xf3, false};
-	const xImplSimd_MoveSSE xMOVUPD = {0x66, false};
+	const xImplSimd_MoveSSE xMOVAPD = {
+		SIMDInstructionInfo(0x28).p66().mov(), SIMDInstructionInfo(0x29).p66().mov(), // aligned load, aligned store
+		SIMDInstructionInfo(0x28).p66().mov(), SIMDInstructionInfo(0x29).p66().mov(), // unaligned slots repeat the aligned encodings
+	};
+	const xImplSimd_MoveSSE xMOVUPD = {
+		SIMDInstructionInfo(0x28).p66().mov(), SIMDInstructionInfo(0x29).p66().mov(), // aligned load, aligned store
+		SIMDInstructionInfo(0x10).p66().mov(), SIMDInstructionInfo(0x11).p66().mov(), // unaligned load, unaligned store
+	};
 #endif
 
 
diff --git a/tests/ctest/common/x86emitter/codegen_tests_main.cpp b/tests/ctest/common/x86emitter/codegen_tests_main.cpp
index eb691aa920..bb678e0dab 100644
--- a/tests/ctest/common/x86emitter/codegen_tests_main.cpp
+++ b/tests/ctest/common/x86emitter/codegen_tests_main.cpp
@@ -337,10 +337,32 @@ TEST(CodegenTests, SSETest)
 	CODEGEN_TEST(xMOVHL.PS(xmm4, xmm9),        "41 0f 12 e1");
 	CODEGEN_TEST(xMOVLH.PS(xmm2, xmm1),        "0f 16 d1");
 
-	CODEGEN_TEST(xMOVAPS(xmm0, xmm1), "0f 28 c1");
-	CODEGEN_TEST(xMOVAPS(xmm8, xmm9), "45 0f 28 c1");
-	CODEGEN_TEST(xMOVUPS(xmm8, ptr128[r8+r9]), "47 0f 10 04 08");
-	CODEGEN_TEST(xMOVAPS(ptr128[rax+r9], xmm8), "46 0f 29 04 08");
+	CODEGEN_TEST(xMOVAPS(xmm0, xmm8),     "41 0f 28 c0");
+	CODEGEN_TEST(xMOVUPS(xmm8, xmm3),     "44 0f 28 c3");
+	CODEGEN_TEST(xMOVAPS(ptr[r8], xmm4),  "41 0f 29 20");
+	CODEGEN_TEST(xMOVUPS(ptr[rax], xmm5), "0f 11 28");
+	CODEGEN_TEST(xMOVAPS(xmm8, ptr[r8]),  "45 0f 28 00");
+	CODEGEN_TEST(xMOVUPS(xmm5, ptr[r9]),  "41 0f 10 29");
+	CODEGEN_TEST(xMOVAPD(ptr[rcx], xmm8), "44 0f 29 01");
+	CODEGEN_TEST(xMOVUPD(ptr[r8], xmm11), "45 0f 11 18");
+	CODEGEN_TEST(xMOVAPD(xmm15, ptr[r9]), "45 0f 28 39");
+	CODEGEN_TEST(xMOVUPD(xmm1, ptr[rax]), "0f 10 08");
+	CODEGEN_TEST(xMOVDQA(ptr[r9], xmm0),  "41 0f 29 01");
+	CODEGEN_TEST(xMOVDQU(ptr[r8], xmm3),  "41 0f 11 18");
+	CODEGEN_TEST(xMOVDQA(xmm8, ptr[rsi]), "44 0f 28 06");
+	CODEGEN_TEST(xMOVDQU(xmm7, ptr[rcx]), "0f 10 39");
+#ifdef ALWAYS_USE_MOVAPS
+	CODEGEN_TEST(xMOVAPD(xmm4, xmm8),     "41 0f 28 e0");
+	CODEGEN_TEST(xMOVUPD(xmm1, xmm4),     "0f 28 cc");
+	CODEGEN_TEST(xMOVDQA(xmm9, xmm11),    "45 0f 28 cb");
+	CODEGEN_TEST(xMOVDQU(xmm7, xmm10),    "41 0f 28 fa");
+#else
+	CODEGEN_TEST(xMOVAPD(xmm4, xmm8),     "66 41 0f 28 e0");
+	CODEGEN_TEST(xMOVUPD(xmm1, xmm4),     "66 0f 28 cc");
+	CODEGEN_TEST(xMOVDQA(xmm9, xmm11),    "66 45 0f 6f cb");
+	CODEGEN_TEST(xMOVDQU(xmm7, xmm10),    "66 41 0f 6f fa");
+#endif
+
 	CODEGEN_TEST(xBLEND.PS(xmm0, xmm1, 0x55), "66 0f 3a 0c c1 55");
 	CODEGEN_TEST(xBLEND.PD(xmm8, xmm9, 0xaa), "66 45 0f 3a 0d c1 aa");
 	CODEGEN_TEST(xPBLEND.W(xmm0, xmm1, 0x55), "66 0f 3a 0e c1 55");
@@ -545,6 +567,40 @@ TEST(CodegenTests, AVXTest)
 	CODEGEN_TEST(xMOVHL.PS(xmm4, xmm9),        "c4 c1 58 12 e1");
 	CODEGEN_TEST(xMOVLH.PS(xmm2, xmm1),        "c5 e8 16 d1");
 
+	CODEGEN_TEST(xMOVAPS(xmm0, xmm8),     "c5 78 29 c0");
+	CODEGEN_TEST(xMOVUPS(xmm8, xmm3),     "c5 78 28 c3");
+	CODEGEN_TEST(xMOVAPS(ptr[r8], xmm4),  "c4 c1 78 29 20");
+	CODEGEN_TEST(xMOVUPS(ptr[rax], xmm5), "c5 f8 11 28");
+	CODEGEN_TEST(xMOVAPS(xmm8, ptr[r8]),  "c4 41 78 28 00");
+	CODEGEN_TEST(xMOVUPS(xmm5, ptr[r9]),  "c4 c1 78 10 29");
+#ifdef ALWAYS_USE_MOVAPS
+	CODEGEN_TEST(xMOVAPD(xmm4, xmm8),     "c5 78 29 c4");
+	CODEGEN_TEST(xMOVUPD(xmm1, xmm4),     "c5 f8 28 cc");
+	CODEGEN_TEST(xMOVAPD(ptr[rcx], xmm8), "c5 78 29 01");
+	CODEGEN_TEST(xMOVUPD(ptr[r8], xmm11), "c4 41 78 11 18");
+	CODEGEN_TEST(xMOVAPD(xmm15, ptr[r9]), "c4 41 78 28 39");
+	CODEGEN_TEST(xMOVUPD(xmm1, ptr[rax]), "c5 f8 10 08");
+	CODEGEN_TEST(xMOVDQA(xmm9, xmm11),    "c4 41 78 28 cb");
+	CODEGEN_TEST(xMOVDQU(xmm7, xmm10),    "c5 78 29 d7");
+	CODEGEN_TEST(xMOVDQA(ptr[r9], xmm0),  "c4 c1 78 29 01");
+	CODEGEN_TEST(xMOVDQU(ptr[r8], xmm3),  "c4 c1 78 11 18");
+	CODEGEN_TEST(xMOVDQA(xmm8, ptr[rsi]), "c5 78 28 06");
+	CODEGEN_TEST(xMOVDQU(xmm7, ptr[rcx]), "c5 f8 10 39");
+#else
+	CODEGEN_TEST(xMOVAPD(xmm4, xmm8),     "c5 79 29 c4");
+	CODEGEN_TEST(xMOVUPD(xmm1, xmm4),     "c5 f9 28 cc");
+	CODEGEN_TEST(xMOVAPD(ptr[rcx], xmm8), "c5 79 29 01");
+	CODEGEN_TEST(xMOVUPD(ptr[r8], xmm11), "c4 41 79 11 18");
+	CODEGEN_TEST(xMOVAPD(xmm15, ptr[r9]), "c4 41 79 28 39");
+	CODEGEN_TEST(xMOVUPD(xmm1, ptr[rax]), "c5 f9 10 08");
+	CODEGEN_TEST(xMOVDQA(xmm9, xmm11),    "c4 41 79 6f cb");
+	CODEGEN_TEST(xMOVDQU(xmm7, xmm10),    "c5 79 7f d7");
+	CODEGEN_TEST(xMOVDQA(ptr[r9], xmm0),  "c4 c1 79 7f 01");
+	CODEGEN_TEST(xMOVDQU(ptr[r8], xmm3),  "c4 c1 7a 7f 18");
+	CODEGEN_TEST(xMOVDQA(xmm8, ptr[rsi]), "c5 79 6f 06");
+	CODEGEN_TEST(xMOVDQU(xmm7, ptr[rcx]), "c5 fa 6f 39");
+#endif
+
 	CODEGEN_TEST(xVMOVAPS(xmm0, xmm1), "c5 f8 28 c1");
 	CODEGEN_TEST(xVMOVAPS(xmm0, ptr32[rdi]), "c5 f8 28 07");
 	CODEGEN_TEST(xVMOVAPS(ptr32[rdi], xmm0), "c5 f8 29 07");