Merge pull request #707 from unknownbrackets/jit-vfpu

Copy over temp fpu regs, enable VecDo3/VDot
2024-11-23 21:39:52 +00:00 · 2013-02-16 15:35:34 -08:00 · 2013-02-16 15:35:34 -08:00 · 1ffde31328
commit 1ffde31328
parent 9338cf9740 b27701ac7d
4 changed files with 129 additions and 24 deletions
--- a/Core/MIPS/MIPSIntVFPU.cpp
+++ b/Core/MIPS/MIPSIntVFPU.cpp
@ -113,7 +113,10 @@ void ApplyPrefixST(float *v, u32 data, VectorSize size)
 			// Prefix may say "z, z, z, z" but if this is a pair, we force to x.
 			// TODO: But some ops seem to use const 0 instead?
 			if (regnum >= n)
+			{
+				ERROR_LOG(CPU, "Invalid VFPU swizzle: %08x / %d", data, size);
 				regnum = 0;
+			}

 			v[i] = origV[regnum];
 			if (abs)
@ -1185,7 +1188,11 @@ namespace MIPSInt
 		ReadVector(s, sz, vs);
 		ApplySwizzleS(s, sz);
 		float scale = V(vt);
-		ApplySwizzleT(&scale, V_Single);
+		if (currentMIPS->vfpuCtrl[VFPU_CTRL_TPREFIX] != 0xE4)
+		{
+			WARN_LOG(CPU, "Broken T prefix used with VScl: %08x / %08x", currentMIPS->vfpuCtrl[VFPU_CTRL_TPREFIX], op);
+			ApplySwizzleT(&scale, V_Single);
+		}
 		int n = GetNumVectorElements(sz);
 		for (int i = 0; i < n; i++)
 		{
--- a/Core/MIPS/x86/CompVFPU.cpp
+++ b/Core/MIPS/x86/CompVFPU.cpp
@ -49,7 +49,7 @@ namespace MIPSComp

 static const float one = 1.0f;
 static const float minus_one = -1.0f;
-static const float zero = -1.0f;
+static const float zero = 0.0f;

 const u32 GC_ALIGNED16( noSignMask[4] ) = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
 const u32 GC_ALIGNED16( signBitLower[4] ) = {0x80000000, 0, 0, 0};
@ -144,6 +144,25 @@ void Jit::ApplyPrefixD(const u8 *vregs, u32 prefix, VectorSize sz, bool onlyWrit
 	}
 }

+// Vector regs can overlap in all sorts of swizzled ways.
+// This does allow a single overlap at sregs[di] (the source lane with the same index as the dest).
+bool DestRegOverlaps(int dreg, int di, int sn, u8 sregs[], int tn, u8 tregs[])
+{
+	for (int i = 0; i < sn; ++i)
+	{
+		if (sregs[i] == dreg && i != di)
+			return true;
+	}
+	for (int i = 0; i < tn; ++i)
+	{
+		if (tregs[i] == dreg)
+			return true;
+	}
+
+	// Hurray, no overlap, we can write directly.
+	return false;
+}
+
 static u32 GC_ALIGNED16(ssLoadStoreTemp[1]);

 void Jit::Comp_SV(u32 op) {
@ -292,7 +311,7 @@ void Jit::Comp_SVQ(u32 op)
 }

 void Jit::Comp_VDot(u32 op) {
-	DISABLE;
+	CONDITIONAL_DISABLE;

 	// WARNING: No prefix support!
 	if (js.MayHavePrefix()) {
@ -314,32 +333,42 @@ void Jit::Comp_VDot(u32 op) {

 	// TODO: applyprefixST here somehow (shuffle, etc...)

-	MOVSS(XMM0, fpr.V(sregs[0]));
-	MULSS(XMM0, fpr.V(tregs[0]));
-
 	int n = GetNumVectorElements(sz);
+	X64Reg tempxreg = XMM0;
+	if (!DestRegOverlaps(dregs[0], 0, n, sregs, n, tregs))
+	{
+		fpr.MapRegsV(dregs, V_Single, MAP_NOINIT);
+		tempxreg = fpr.VX(dregs[0]);
+	}
+
+	MOVSS(tempxreg, M((void *) &zero));
+	MOVSS(XMM1, fpr.V(sregs[0]));
+	MULSS(XMM1, fpr.V(tregs[0]));
+	ADDSS(tempxreg, R(XMM1));
+
 	for (int i = 1; i < n; i++)
 	{
 		// sum += s[i]*t[i];
 		MOVSS(XMM1, fpr.V(sregs[i]));
 		MULSS(XMM1, fpr.V(tregs[i]));
-		ADDSS(XMM0, R(XMM1));
+		ADDSS(tempxreg, R(XMM1));
 	}
-	fpr.ReleaseSpillLocks();

-	fpr.MapRegsV(dregs, V_Single, MAP_NOINIT);
+	if (!fpr.V(dregs[0]).IsSimpleReg(tempxreg))
+	{
+		fpr.MapRegsV(dregs, V_Single, MAP_NOINIT);
+		MOVSS(fpr.V(dregs[0]), XMM0);
+	}

 	// TODO: applyprefixD here somehow (write mask etc..)

-	MOVSS(fpr.V(vd), XMM0);
-
 	fpr.ReleaseSpillLocks();

 	js.EatPrefix();
 }

 void Jit::Comp_VecDo3(u32 op) {
-	DISABLE;
+	CONDITIONAL_DISABLE;

 	// WARNING: No prefix support!
 	if (js.MayHavePrefix())
@ -394,16 +423,42 @@ void Jit::Comp_VecDo3(u32 op) {
 	}

 	int n = GetNumVectorElements(sz);
-	// We need at least n temporaries...
-	if (n > 2)
-		fpr.Flush();
+
+	X64Reg tempxregs[4];
+	for (int i = 0; i < n; ++i)
+	{
+		if (DestRegOverlaps(dregs[i], i, n, sregs, n, tregs))
+		{
+			// On 32-bit we only have 6 xregs for mips regs, use XMM0/XMM1 if possible.
+			if (i < 2)
+				tempxregs[i] = (X64Reg) (XMM0 + i);
+			else
+			{
+				fpr.BindToRegister(TEMP0 + i, false, true);
+				fpr.SpillLock(TEMP0 + i);
+				tempxregs[i] = fpr.RX(TEMP0 + i);
+			}
+		}
+		else
+		{
+			fpr.MapRegV(dregs[i], (dregs[i] == sregs[i] ? 0 : MAP_NOINIT) | MAP_DIRTY);
+			fpr.SpillLockV(dregs[i]);
+			tempxregs[i] = fpr.VX(dregs[i]);
+		}
+	}

 	for (int i = 0; i < n; ++i)
-		MOVSS((X64Reg) (XMM0 + i), fpr.V(sregs[i]));
+	{
+		if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
+			MOVSS(tempxregs[i], fpr.V(sregs[i]));
+	}
 	for (int i = 0; i < n; ++i)
-		(this->*xmmop)((X64Reg) (XMM0 + i), fpr.V(tregs[i]));
+		(this->*xmmop)(tempxregs[i], fpr.V(tregs[i]));
 	for (int i = 0; i < n; ++i)
-		MOVSS(fpr.V(dregs[i]), (X64Reg) (XMM0 + i));
+	{
+		if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
+			MOVSS(fpr.V(dregs[i]), tempxregs[i]);
+	}

 	fpr.ReleaseSpillLocks();

--- a/Core/MIPS/x86/RegCacheFPU.cpp
+++ b/Core/MIPS/x86/RegCacheFPU.cpp
@ -82,6 +82,8 @@ void FPURegCache::MapRegsV(const u8 *v, VectorSize sz, int flags) {
 void FPURegCache::ReleaseSpillLocks() {
 	for (int i = 0; i < NUM_MIPS_FPRS; i++)
 		regs[i].locked = false;
+	for (int i = TEMP0; i < TEMP0 + NUM_TEMPS; ++i)
+		DiscardR(i);
 }

 void FPURegCache::BindToRegister(const int i, bool doLoad, bool makeDirty) {
@ -97,7 +99,9 @@ void FPURegCache::BindToRegister(const int i, bool doLoad, bool makeDirty) {
 			if (!regs[i].location.IsImm() && (regs[i].location.offset & 0x3)) {
 				PanicAlert("WARNING - misaligned fp register location %i", i);
 			}
-			emit->MOVSS(xr, regs[i].location);
+			if (i < TEMP0) {
+				emit->MOVSS(xr, regs[i].location);
+			}
 		}
 		regs[i].location = newloc;
 		regs[i].away = true;
@ -124,8 +128,27 @@ void FPURegCache::StoreFromRegister(int i) {
 	}
 }

+void FPURegCache::DiscardR(int i) {
+	_assert_msg_(DYNA_REC, !regs[i].location.IsImm(), "FPU can't handle imm yet.");
+	if (regs[i].away) {
+		X64Reg xr = regs[i].location.GetSimpleReg();
+		_assert_msg_(DYNA_REC, xr < NUM_X_FPREGS, "DiscardR: MipsReg had bad X64Reg");
+		// Note that we DO NOT write it back here. That's the whole point of Discard.
+		xregs[xr].dirty = false;
+		xregs[xr].mipsReg = -1;
+		regs[i].location = GetDefaultLocation(i);
+		regs[i].away = false;
+	} else {
+		//	_assert_msg_(DYNA_REC,0,"already stored");
+	}
+}
+
+bool FPURegCache::IsTemp(X64Reg xr) {
+	return xregs[xr].mipsReg >= TEMP0;
+}
+
 void FPURegCache::Flush() {
-	for (int i = 0; i < NUM_MIPS_FPRS; i++) {
+	for (int i = 0; i < TEMP0; i++) {
 		if (regs[i].locked) {
 			PanicAlert("Somebody forgot to unlock MIPS reg %i.", i);
 		}
@ -141,6 +164,9 @@ void FPURegCache::Flush() {
 			}
 		}
 	}
+	for (int i = TEMP0; i < TEMP0 + NUM_TEMPS; ++i) {
+		DiscardR(i);
+	}
 }

 OpArg FPURegCache::GetDefaultLocation(int reg) const {
@ -203,7 +229,7 @@ X64Reg FPURegCache::GetFreeXReg() {
 	return (X64Reg) -1;
 }

-void FPURegCache::FlushR(X64Reg reg) {
+void FPURegCache::FlushX(X64Reg reg) {
 	if (reg >= NUM_X_FPREGS)
 		PanicAlert("Flushing non existent reg");
 	if (xregs[reg].mipsReg != -1) {
--- a/Core/MIPS/x86/RegCacheFPU.h
+++ b/Core/MIPS/x86/RegCacheFPU.h
@ -26,9 +26,18 @@ using namespace Gen;


 // GPRs are numbered 0 to 31
-// VFPU regs are numbered 32 to 160.
+// VFPU regs are numbered 32 to 159.
+// Then we have some temp regs for VFPU handling from 160 to 163 (NUM_TEMPS = 4).

-#define NUM_MIPS_FPRS (32 + 128)
+enum {
+	NUM_TEMPS = 4,
+	TEMP0 = 32 + 128,
+	TEMP1 = TEMP0 + 1,
+	TEMP2 = TEMP0 + 2,
+	TEMP3 = TEMP0 + 3,
+	TEMP4 = TEMP0 + 4,
+	NUM_MIPS_FPRS = 32 + 128 + NUM_TEMPS,
+};

 #ifdef _M_X64
 #define NUM_X_FPREGS 16
@ -68,6 +77,11 @@ public:
 		StoreFromRegister(preg + 32);
 	}
 	OpArg GetDefaultLocation(int reg) const;
+	void DiscardR(int freg);
+	void DiscardV(int vreg) {
+		DiscardR(vreg + 32);
+	}
+	bool IsTemp(X64Reg xreg);

 	void SetEmitter(XEmitter *emitter) {emit = emitter;}

@ -100,6 +114,9 @@ public:
 	void MapRegV(int vreg, int flags);
 	void MapRegsV(int vec, VectorSize vsz, int flags);
 	void MapRegsV(const u8 *v, VectorSize vsz, int flags);
+	void SpillLockV(int vreg) {
+		SpillLock(vreg + 32);
+	}
 	void SpillLockV(const u8 *v, VectorSize vsz);
 	void SpillLockV(int vec, VectorSize vsz);

@ -107,7 +124,7 @@ public:

 private:
 	X64Reg GetFreeXReg();
-	void FlushR(X64Reg reg); 
+	void FlushX(X64Reg reg);
 	const int *GetAllocationOrder(int &count);

 	MIPSCachedFPReg regs[NUM_MIPS_FPRS];