Merge pull request #2093 from xsacha/armjit-vfpu

Armjit: Improve ApplyPrefixD. Add VABD to emitter.
2025-02-26 08:55:58 +00:00 · 2013-06-05 10:30:02 -07:00 · 2013-06-05 10:30:02 -07:00 · 7574ebbe58
commit 7574ebbe58
parent 5ba64b586a 10c976b2af
4 changed files with 35 additions and 35 deletions
--- a/Common/ArmEmitter.cpp
+++ b/Common/ArmEmitter.cpp
@ -850,6 +850,21 @@ ARMReg ARMXEmitter::SubBase(ARMReg Reg)
 }

 // NEON Specific
+// Emits the NEON three-register "absolute difference" instruction: VABD Vd, Vn, Vm.
+//
+// Vd decides between the D-register and Q-register form (Vd >= Q0 sets the Q bit).
+// All three registers are converted with SubBase() and then encoded as 5-bit
+// fields, each split into a high bit (D/N/M at bits 22/7/5) and a low nibble
+// (Vd at bits 15:12, Vn at bits 19:16, Vm at bits 3:0).
+// Size is placed verbatim in bits 21:20.
+//
+// NOTE(review): 0xF3 in the top byte together with 0b1101 in bits 11:8 is the
+// floating-point VABD encoding, which requires bit 21 = 1 and bit 20 (sz) = 0.
+// Confirm that callers pass a Size value that yields that bit pattern.
+void ARMXEmitter::VABD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to VABD(float)");
+	_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VABD(float) when CPU doesn't support it");
+	// Must be sampled before SubBase() strips the register-class offset from Vd.
+	bool register_quad = Vd >= Q0;
+
+	// Gets encoded as a double register
+	Vd = SubBase(Vd);
+	Vn = SubBase(Vn);
+	Vm = SubBase(Vm);
+
+	// Bit 6 is Q and bit 5 is M (high bit of Vm). M must therefore be shifted by
+	// one, not two; shifting by two would drop it onto the Q bit and corrupt the
+	// instruction whenever Vm refers to the upper half of the register bank.
+	Write32((0xF3 << 24) | ((Vd & 0x10) << 18) | (Size << 20) | ((Vn & 0xF) << 16) \
+		| ((Vd & 0xF) << 12) | (0xD << 8) | ((Vn & 0x10) << 3) | (register_quad << 6) \
+		| ((Vm & 0x10) << 1) | (Vm & 0xF));
+}
 void ARMXEmitter::VADD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to VADD(integer)");
@ -864,7 +879,7 @@ void ARMXEmitter::VADD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)

 	Write32((0xF2 << 24) | ((Vd & 0x10) << 18) | (Size << 20) | ((Vn & 0xF) << 16) \
 		| ((Vd & 0xF) << 12) | (0x8 << 8) | ((Vn & 0x10) << 3) | (register_quad << 6) \
-		| ((Vm & 0x10) << 2) | (Vm & 0xF)); 
+		| ((Vm & 0x10) << 2) | (Vm & 0xF));

 }
 void ARMXEmitter::VSUB(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
@ -879,7 +894,7 @@ void ARMXEmitter::VSUB(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)

 	Write32((0xF3 << 24) | ((Vd & 0x10) << 18) | (Size << 20) | ((Vn & 0xF) << 16) \
 		| ((Vd & 0xF) << 12) | (0x8 << 8) | ((Vn & 0x10) << 3) | (1 << 6) \
-		| ((Vm & 0x10) << 2) | (Vm & 0xF)); 
+		| ((Vm & 0x10) << 2) | (Vm & 0xF));
 }

 // VFP Specific
--- a/Common/ArmEmitter.h
+++ b/Common/ArmEmitter.h
@ -530,6 +530,7 @@ public:
 	// Subtracts the base from the register to give us the real one
 	ARMReg SubBase(ARMReg Reg);	
 	// NEON Only
+	void VABD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
 	void VADD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
 	void VSUB(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);

--- a/Core/MIPS/ARM/ArmCompVFPU.cpp
+++ b/Core/MIPS/ARM/ArmCompVFPU.cpp
@ -17,6 +17,7 @@

 #include "../../MemMap.h"
 #include "../MIPSAnalyst.h"
+#include "Common/CPUDetect.h"
 #include "Core/Config.h"
 #include "Core/Reporting.h"

@ -166,39 +167,22 @@ namespace MIPSComp

 			int sat = (js.prefixD >> (i * 2)) & 3;
 			if (sat == 1) {
+				// clamped = fabs(x) - fabs(x-0.5f) + 0.5f; // [ 0, 1]
 				fpr.MapRegV(vregs[i], MAP_DIRTY);
-				// ARGH this is a pain - no MIN/MAX in non-NEON VFP!
-				// NEON does have min/max though so this should only be a fallback.
-				MOVI2F(S0, 0.0, R0);
-				MOVI2F(S1, 1.0, R0);
-				VCMP(fpr.V(vregs[i]), S1);
-				VMRS_APSR();
-				SetCC(CC_GE);
-				VMOV(fpr.V(vregs[i]), S1);
-				FixupBranch skip = B();
-				SetCC(CC_AL);
-				VCMP(fpr.V(vregs[i]), S0);
-				VMRS_APSR();
-				SetCC(CC_LE);
-				VMOV(fpr.V(vregs[i]), S0);
-				SetCC(CC_AL);
-				SetJumpTarget(skip);
+				MOVI2F(S0, 0.5, R0);
+				VABS(S1, fpr.V(vregs[i]));     // S1 = fabs(x)
+				VSUB(S2, fpr.V(vregs[i]), S0); // S2 = fabs(x-0.5f) {VABD}
+				VABS(S2, S2);
+				VSUB(fpr.V(vregs[i]), S1, S2); // v[i] = S1 - S2 + 0.5f
+				VADD(fpr.V(vregs[i]), fpr.V(vregs[i]), S0);
 			} else if (sat == 3) {
+				// clamped = fabs(x) - fabs(x-1.0f);        // [-1, 1]
 				fpr.MapRegV(vregs[i], MAP_DIRTY);
-				MOVI2F(S0, -1.0, R0);
-				MOVI2F(S1, 1.0, R0);
-				VCMP(fpr.V(vregs[i]), S1);
-				VMRS_APSR();
-				SetCC(CC_GE);
-				VMOV(fpr.V(vregs[i]), S1);
-				FixupBranch skip = B();
-				SetCC(CC_AL);
-				VCMP(fpr.V(vregs[i]), S0);
-				VMRS_APSR();
-				SetCC(CC_LE);
-				VMOV(fpr.V(vregs[i]), S0);
-				SetCC(CC_AL);
-				SetJumpTarget(skip);
+				MOVI2F(S0, 1.0, R0);
+				VABS(S1, fpr.V(vregs[i]));     // S1 = fabs(x)
+				VSUB(S2, fpr.V(vregs[i]), S0); // S2 = fabs(x-1.0f) {VABD}
+				VABS(S2, S2);
+				VSUB(fpr.V(vregs[i]), S1, S2); // v[i] = S1 - S2
 			}
 		}
 	}
--- a/Core/MIPS/MIPSIntVFPU.cpp
+++ b/Core/MIPS/MIPSIntVFPU.cpp
@ -115,9 +115,9 @@ return floor(x+.5);

 void ApplyPrefixST(float *v, u32 data, VectorSize size)
 {
-  // Possible optimization shortcut:
-  if (data == 0xe4)
-    return;
+	// Possible optimization shortcut:
+	if (data == 0xe4)
+		return;

 	int n = GetNumVectorElements(size);
 	float origV[4];