From 29109d25af75d50f2a1cad2ef43eb31187f914f9 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sat, 20 Apr 2013 01:11:40 -0700
Subject: [PATCH] Non-optimal vmmul for x86 jit.

It's already faster than the interpreter, but it could be much better.
---
 Core/MIPS/ARM/ArmCompVFPU.cpp |  4 +++
 Core/MIPS/ARM/ArmJit.h        |  1 +
 Core/MIPS/MIPSTables.cpp      |  8 +++---
 Core/MIPS/x86/CompVFPU.cpp    | 48 +++++++++++++++++++++++++++++++++++
 Core/MIPS/x86/Jit.h           |  1 +
 5 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/Core/MIPS/ARM/ArmCompVFPU.cpp b/Core/MIPS/ARM/ArmCompVFPU.cpp
index 21d294cdb..bf59bb6a3 100644
--- a/Core/MIPS/ARM/ArmCompVFPU.cpp
+++ b/Core/MIPS/ARM/ArmCompVFPU.cpp
@@ -718,4 +718,8 @@ namespace MIPSComp
 		DISABLE;
 	}
 
+	void Jit::Comp_Vmmul(u32 op) {  // vmmul: the ARM JIT emits no code of its own for this op.
+		DISABLE;  // NOTE(review): presumably defers to the generic fallback path - confirm against the macro.
+	}
+
 }
diff --git a/Core/MIPS/ARM/ArmJit.h b/Core/MIPS/ARM/ArmJit.h
index b2cb72beb..1e726db71 100644
--- a/Core/MIPS/ARM/ArmJit.h
+++ b/Core/MIPS/ARM/ArmJit.h
@@ -194,6 +194,7 @@ public:
 	void Comp_Vmtvc(u32 op);
 	void Comp_Vmmov(u32 op);
 	void Comp_VScl(u32 op);
+	void Comp_Vmmul(u32 op);
 
 	ArmJitBlockCache *GetBlockCache() { return &blocks; }
 
diff --git a/Core/MIPS/MIPSTables.cpp b/Core/MIPS/MIPSTables.cpp
index 8b347554e..f52773591 100644
--- a/Core/MIPS/MIPSTables.cpp
+++ b/Core/MIPS/MIPSTables.cpp
@@ -626,10 +626,10 @@ MIPSInstruction tableVFPU5[8] =  //110111 xxx
 const MIPSInstruction tableVFPU6[32] =  //111100 xxx
 {
 //0
-	INSTR("vmmul",&Jit::Comp_Generic, Dis_MatrixMult, Int_Vmmul, IS_VFPU|OUT_EAT_PREFIX),
-	INSTR("vmmul",&Jit::Comp_Generic, Dis_MatrixMult, Int_Vmmul, IS_VFPU|OUT_EAT_PREFIX),
-	INSTR("vmmul",&Jit::Comp_Generic, Dis_MatrixMult, Int_Vmmul, IS_VFPU|OUT_EAT_PREFIX),
-	INSTR("vmmul",&Jit::Comp_Generic, Dis_MatrixMult, Int_Vmmul, IS_VFPU|OUT_EAT_PREFIX),
+	INSTR("vmmul",&Jit::Comp_Vmmul, Dis_MatrixMult, Int_Vmmul, IS_VFPU|OUT_EAT_PREFIX),
+	INSTR("vmmul",&Jit::Comp_Vmmul, Dis_MatrixMult, Int_Vmmul, IS_VFPU|OUT_EAT_PREFIX),
+	INSTR("vmmul",&Jit::Comp_Vmmul, Dis_MatrixMult, Int_Vmmul, IS_VFPU|OUT_EAT_PREFIX),
+	INSTR("vmmul",&Jit::Comp_Vmmul, Dis_MatrixMult, Int_Vmmul, IS_VFPU|OUT_EAT_PREFIX),
 
 	INSTR("v(h)tfm2",&Jit::Comp_Generic, Dis_Vtfm, Int_Vtfm, IS_VFPU|OUT_EAT_PREFIX),
 	INSTR("v(h)tfm2",&Jit::Comp_Generic, Dis_Vtfm, Int_Vtfm, IS_VFPU|OUT_EAT_PREFIX),
diff --git a/Core/MIPS/x86/CompVFPU.cpp b/Core/MIPS/x86/CompVFPU.cpp
index e8a97249c..2c1c73333 100644
--- a/Core/MIPS/x86/CompVFPU.cpp
+++ b/Core/MIPS/x86/CompVFPU.cpp
@@ -783,4 +783,52 @@ void Jit::Comp_VScl(u32 op) {
 	fpr.ReleaseSpillLocks();
 }
 
+void Jit::Comp_Vmmul(u32 op) {  // Emits scalar SSE code for the VFPU vmmul (matrix multiply) op.
+	CONDITIONAL_DISABLE;
+
+	// TODO: Prefixes are not handled below (this probably ignores them?), so bail out when one may be active.
+	if (js.MayHavePrefix())
+		DISABLE;  // NOTE(review): presumably falls back to the generic path - confirm against the macro.
+
+	MatrixSize sz = GetMtxSize(op);
+	int n = GetMatrixSide(sz);  // Loop bound below: the matrices are walked as n x n.
+
+	u8 sregs[16], tregs[16], dregs[16];  // Per-element register indices, addressed with a stride of 4.
+	GetMatrixRegs(sregs, sz, _VS);
+	GetMatrixRegs(tregs, sz, _VT);
+	GetMatrixRegs(dregs, sz, _VD);
+
+	// TODO: test overlap, fix non-optimal.  Until then every result goes to a temp first, so no dreg is written before all inputs are read.
+	u8 tempregs[16];
+	for (int a = 0; a < n; a++)
+	{
+		for (int b = 0; b < n; b++)
+		{
+			XORPS(XMM0, R(XMM0));  // Zero the accumulator for element [a * 4 + b].
+			for (int c = 0; c < n; c++)
+			{
+				MOVSS(XMM1, fpr.V(sregs[b * 4 + c]));
+				MULSS(XMM1, fpr.V(tregs[a * 4 + c]));
+				ADDSS(XMM0, R(XMM1));  // XMM0 += sregs[b * 4 + c] * tregs[a * 4 + c]
+			}
+			u8 temp = (u8) fpr.GetTempV();  // One fresh temp per output element (up to n * n of them).
+			fpr.MapRegV(temp, MAP_NOINIT | MAP_DIRTY);
+			MOVSS(fpr.VX(temp), R(XMM0));
+			fpr.StoreFromRegisterV(temp);  // NOTE(review): presumably writes the temp back and frees its XMM reg - confirm.
+			tempregs[a * 4 + b] = temp;  // Remember which temp holds this element for the copy pass.
+		}
+	}
+	for (int a = 0; a < n; a++)  // Second pass: copy each parked result into its destination register.
+	{
+		for (int b = 0; b < n; b++)
+		{
+			u8 temp = tempregs[a * 4 + b];
+			fpr.MapRegV(temp, 0);  // Re-map the temp so fpr.VX(temp) can be used as the MOVSS source.
+			MOVSS(fpr.V(dregs[a * 4 + b]), fpr.VX(temp));
+		}
+	}
+
+	fpr.ReleaseSpillLocks();
+}
+
 }
diff --git a/Core/MIPS/x86/Jit.h b/Core/MIPS/x86/Jit.h
index 8a1c8cd97..f7f1121bf 100644
--- a/Core/MIPS/x86/Jit.h
+++ b/Core/MIPS/x86/Jit.h
@@ -204,6 +204,7 @@ public:
 	void Comp_Vmtvc(u32 op);
 	void Comp_Vmmov(u32 op);
 	void Comp_VScl(u32 op);
+	void Comp_Vmmul(u32 op);
 
 	void Comp_DoNothing(u32 op);